summaryrefslogtreecommitdiffstats
path: root/llvm
diff options
context:
space:
mode:
Diffstat (limited to 'llvm')
-rw-r--r--llvm/lib/Target/AMDGPU/SIFrameLowering.cpp27
-rw-r--r--llvm/lib/Target/AMDGPU/SIISelLowering.cpp10
-rw-r--r--llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll132
-rw-r--r--llvm/test/CodeGen/AMDGPU/call-argument-types.ll96
-rw-r--r--llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll16
-rw-r--r--llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll94
-rw-r--r--llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll16
-rw-r--r--llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll74
-rw-r--r--llvm/test/CodeGen/AMDGPU/function-args.ll174
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-hi16.ll22
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-lo16.ll20
-rw-r--r--llvm/test/CodeGen/AMDGPU/multi-dword-vgpr-spill.ll109
-rw-r--r--llvm/test/CodeGen/AMDGPU/nested-calls.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/sibling-call.ll55
-rw-r--r--llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll58
-rw-r--r--llvm/test/CodeGen/AMDGPU/stack-realign.ll13
-rw-r--r--llvm/test/CodeGen/AMDGPU/store-hi16.ll6
19 files changed, 496 insertions, 442 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 7f3150bdd01..48f5f645075 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -773,22 +773,17 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
!AllSGPRSpilledToVGPRs || !allStackObjectsAreDead(MFI)) {
assert(RS && "RegScavenger required if spilling");
- // We force this to be at offset 0 so no user object ever has 0 as an
- // address, so we may use 0 as an invalid pointer value. This is because
- // LLVM assumes 0 is an invalid pointer in address space 0. Because alloca
- // is required to be address space 0, we are forced to accept this for
- // now. Ideally we could have the stack in another address space with 0 as a
- // valid pointer, and -1 as the null value.
- //
- // This will also waste additional space when user stack objects require > 4
- // byte alignment.
- //
- // The main cost here is losing the offset for addressing modes. However
- // this also ensures we shouldn't need a register for the offset when
- // emergency scavenging.
- int ScavengeFI = MFI.CreateFixedObject(
- TRI.getSpillSize(AMDGPU::SGPR_32RegClass), 0, false);
- RS->addScavengingFrameIndex(ScavengeFI);
+ if (FuncInfo->isEntryFunction()) {
+ int ScavengeFI = MFI.CreateFixedObject(
+ TRI.getSpillSize(AMDGPU::SGPR_32RegClass), 0, false);
+ RS->addScavengingFrameIndex(ScavengeFI);
+ } else {
+ int ScavengeFI = MFI.CreateStackObject(
+ TRI.getSpillSize(AMDGPU::SGPR_32RegClass),
+ TRI.getSpillAlignment(AMDGPU::SGPR_32RegClass),
+ false);
+ RS->addScavengingFrameIndex(ScavengeFI);
+ }
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 8a08bc463da..f5fcb7cdfe0 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1940,12 +1940,6 @@ SDValue SITargetLowering::LowerFormalArguments(
bool IsKernel = AMDGPU::isKernel(CallConv);
bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
- if (!IsEntryFunc) {
- // 4 bytes are reserved at offset 0 for the emergency stack slot. Skip over
- // this when allocating argument fixed offsets.
- CCInfo.AllocateStack(4, 4);
- }
-
if (IsShader) {
processShaderInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
@@ -2551,7 +2545,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
"unsupported call from graphics shader of function ");
}
- // The first 4 bytes are reserved for the callee's emergency stack slot.
if (IsTailCall) {
IsTailCall = isEligibleForTailCallOptimization(
Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
@@ -2578,9 +2571,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
- // The first 4 bytes are reserved for the callee's emergency stack slot.
- CCInfo.AllocateStack(4, 4);
-
CCInfo.AnalyzeCallOperands(Outs, AssignFn);
// Get a count of how many bytes are to be pushed on the stack.
diff --git a/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll
index c21abaeaaf5..c0f86da2148 100644
--- a/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll
+++ b/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll
@@ -4,14 +4,14 @@
%struct.ByValStruct = type { [4 x i32] }
; GCN-LABEL: {{^}}void_func_byval_struct:
-; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s32 offset:4{{$}}
+; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s32{{$}}
; GCN-NOT: s32
-; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s32 offset:4{{$}}
+; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s32{{$}}
; GCN-NOT: s32
-; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s32 offset:20{{$}}
+; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s32 offset:16{{$}}
; GCN-NOT: s32
-; GCN: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:20{{$}}
+; GCN: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:16{{$}}
; GCN-NOT: s32
define hidden void @void_func_byval_struct(%struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg0, %struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg1) #1 {
entry:
@@ -34,16 +34,16 @@ entry:
; GCN-DAG: buffer_store_dword v33
; GCN-NOT: v_writelane_b32 v{{[0-9]+}}, s32
; GCN-DAG: v_writelane_b32
-; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5 offset:4{{$}}
+; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5{{$}}
; GCN-DAG: v_add_{{[iu]}}32_e32 [[ADD0:v[0-9]+]], vcc, 1, [[LOAD0]]
-; GCN-DAG: buffer_store_dword [[ADD0]], off, s[0:3], s5 offset:4{{$}}
+; GCN-DAG: buffer_store_dword [[ADD0]], off, s[0:3], s5{{$}}
-; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:20{{$}}
+; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:16{{$}}
; GCN-DAG: v_add_{{[iu]}}32_e32 [[ADD1:v[0-9]+]], vcc, 2, [[LOAD1]]
; GCN: s_swappc_b64
-; GCN: buffer_store_dword [[ADD1]], off, s[0:3], s5 offset:20{{$}}
+; GCN: buffer_store_dword [[ADD1]], off, s[0:3], s5 offset:16{{$}}
; GCN: v_readlane_b32
; GCN-NOT: v_readlane_b32 s32
@@ -74,31 +74,31 @@ entry:
; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
; GCN-DAG: v_mov_b32_e32 [[THIRTEEN:v[0-9]+]], 13
-; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s5 offset:8
-; GCN-DAG: buffer_store_dword [[THIRTEEN]], off, s[0:3], s5 offset:24
+; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s5{{$}}
+; GCN-DAG: buffer_store_dword [[THIRTEEN]], off, s[0:3], s5 offset:16
-; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5 offset:8
-; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:12
-; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s5 offset:16
-; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s5 offset:20
+; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5{{$}}
+; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:4
+; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s5 offset:8
+; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s5 offset:12
; GCN-NOT: s_add_u32 s32, s32, 0x800
-; GCN-DAG: buffer_store_dword [[LOAD0]], off, s[0:3], s32 offset:4{{$}}
-; GCN-DAG: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:8
-; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:12
-; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:16
+; GCN-DAG: buffer_store_dword [[LOAD0]], off, s[0:3], s32{{$}}
+; GCN-DAG: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:4
+; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:8
+; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:12
-; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s5 offset:24
-; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s5 offset:28
-; GCN: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s5 offset:32
-; GCN: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s5 offset:36
+; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s5 offset:16
+; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s5 offset:20
+; GCN: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s5 offset:24
+; GCN: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s5 offset:28
-; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:20
-; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:24
-; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:28
-; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:32
+; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:16
+; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:20
+; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:24
+; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:28
; GCN: s_swappc_b64
; GCN-NOT: v_readlane_b32 s32
@@ -144,20 +144,20 @@ entry:
; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s33 offset:16
; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s33 offset:20
-; GCN-DAG: buffer_store_dword [[LOAD0]], off, s[0:3], s32 offset:4{{$}}
-; GCN-DAG: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:8
-; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:12
-; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:16
+; GCN-DAG: buffer_store_dword [[LOAD0]], off, s[0:3], s32{{$}}
+; GCN-DAG: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:4
+; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:8
+; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:12
; GCN-DAG: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s33 offset:24
; GCN-DAG: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s33 offset:28
; GCN-DAG: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s33 offset:32
; GCN-DAG: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s33 offset:36
-; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:20
-; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:24
-; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:28
-; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:32
+; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:16
+; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:20
+; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:24
+; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:28
; GCN: s_swappc_b64
@@ -182,14 +182,14 @@ entry:
}
; GCN-LABEL: {{^}}void_func_byval_struct_align8:
-; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s32 offset:8{{$}}
+; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s32{{$}}
; GCN-NOT: s32
-; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s32 offset:8{{$}}
+; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s32{{$}}
; GCN-NOT: s32
-; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s32 offset:24{{$}}
+; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s32 offset:16{{$}}
; GCN-NOT: s32
-; GCN: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:24{{$}}
+; GCN: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:16{{$}}
; GCN-NOT: s32
define hidden void @void_func_byval_struct_align8(%struct.ByValStruct addrspace(5)* byval noalias nocapture align 8 %arg0, %struct.ByValStruct addrspace(5)* byval noalias nocapture align 8 %arg1) #1 {
entry:
@@ -222,20 +222,20 @@ entry:
; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s33 offset:16
; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s33 offset:20
-; GCN-DAG: buffer_store_dword [[LOAD0]], off, s[0:3], s32 offset:8{{$}}
-; GCN-DAG: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:12
-; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:16
-; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:20
+; GCN-DAG: buffer_store_dword [[LOAD0]], off, s[0:3], s32{{$}}
+; GCN-DAG: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:4
+; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:8
+; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:12
; GCN-DAG: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s33 offset:24
; GCN-DAG: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s33 offset:28
; GCN-DAG: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s33 offset:32
; GCN-DAG: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s33 offset:36
-; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:24
-; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:28
-; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:32
-; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:36
+; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:16
+; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:20
+; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:24
+; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:28
; GCN: s_swappc_b64
@@ -267,30 +267,30 @@ entry:
; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
; GCN-DAG: v_mov_b32_e32 [[THIRTEEN:v[0-9]+]], 13
-; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s5 offset:8
-; GCN-DAG: buffer_store_dword [[THIRTEEN]], off, s[0:3], s5 offset:24
+; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s5{{$}}
+; GCN-DAG: buffer_store_dword [[THIRTEEN]], off, s[0:3], s5 offset:16
-; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5 offset:8
-; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:12
-; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s5 offset:16
-; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s5 offset:20
+; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5{{$}}
+; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:4
+; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s5 offset:8
+; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s5 offset:12
; GCN-NOT: s_add_u32 s32, s32, 0x800
-; GCN-DAG: buffer_store_dword [[LOAD0]], off, s[0:3], s32 offset:8{{$}}
-; GCN-DAG: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:12
-; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:16
-; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:20
-
-; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s5 offset:24
-; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s5 offset:28
-; GCN: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s5 offset:32
-; GCN: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s5 offset:36
-
-; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:24
-; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:28
-; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:32
-; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:36
+; GCN-DAG: buffer_store_dword [[LOAD0]], off, s[0:3], s32{{$}}
+; GCN-DAG: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:4
+; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:8
+; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:12
+
+; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s5 offset:16
+; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s5 offset:20
+; GCN: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s5 offset:24
+; GCN: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s5 offset:28
+; GCN: s_waitcnt vmcnt(0)
+; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:16
+; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:20
+; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:24
+; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:28
; GCN: s_swappc_b64
; GCN-NOT: v_readlane_b32 s32
diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index 79a238a287d..9432031527c 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -636,7 +636,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
; GCN-DAG: buffer_load_dwordx4 v[28:31], off
; GCN: s_waitcnt
-; GCN: buffer_store_dword [[VAL1]], off, s[{{[0-9]+}}:{{[0-9]+}}], s32 offset:4{{$}}
+; GCN: buffer_store_dword [[VAL1]], off, s[{{[0-9]+}}:{{[0-9]+}}], s32{{$}}
; GCN: s_swappc_b64
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 {
@@ -687,15 +687,15 @@ define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 {
; HSA: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s[0:3], s33 offset:8
; HSA: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s[0:3], s33 offset:12
-; HSA-DAG: buffer_store_dword [[RELOAD_VAL0]], off, s[0:3], [[SP]] offset:4
-; HSA-DAG: buffer_store_dword [[RELOAD_VAL1]], off, s[0:3], [[SP]] offset:8
+; HSA-DAG: buffer_store_dword [[RELOAD_VAL0]], off, s[0:3], [[SP]]{{$}}
+; HSA-DAG: buffer_store_dword [[RELOAD_VAL1]], off, s[0:3], [[SP]] offset:4
; MESA: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s[36:39], s33 offset:8
; MESA: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s[36:39], s33 offset:12
-; MESA-DAG: buffer_store_dword [[RELOAD_VAL0]], off, s[36:39], [[SP]] offset:4
-; MESA-DAG: buffer_store_dword [[RELOAD_VAL1]], off, s[36:39], [[SP]] offset:8
+; MESA-DAG: buffer_store_dword [[RELOAD_VAL0]], off, s[36:39], [[SP]]{{$}}
+; MESA-DAG: buffer_store_dword [[RELOAD_VAL1]], off, s[36:39], [[SP]] offset:4
; GCN-NEXT: s_swappc_b64
; GCN-NOT: [[SP]]
@@ -722,8 +722,8 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0
; GCN-DAG: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:12
; GCN-NOT: s_add_u32 [[SP]]
-; GCN-DAG: buffer_store_dword [[RELOAD_VAL0]], off, s{{\[[0-9]+:[0-9]+\]}}, [[SP]] offset:4
-; GCN-DAG: buffer_store_dword [[RELOAD_VAL1]], off, s{{\[[0-9]+:[0-9]+\]}}, [[SP]] offset:8
+; GCN-DAG: buffer_store_dword [[RELOAD_VAL0]], off, s{{\[[0-9]+:[0-9]+\]}}, [[SP]]{{$}}
+; GCN-DAG: buffer_store_dword [[RELOAD_VAL1]], off, s{{\[[0-9]+:[0-9]+\]}}, [[SP]] offset:4
; GCN: s_swappc_b64
; GCN-DAG: buffer_load_ubyte [[LOAD_OUT_VAL0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:16
; GCN-DAG: buffer_load_dword [[LOAD_OUT_VAL1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:20
@@ -758,8 +758,8 @@ define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 {
}
; GCN-LABEL: {{^}}stack_passed_arg_alignment_v32i32_f64:
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s32 offset:8
; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s32 offset:4
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s32{{$}}
; GCN: s_swappc_b64
define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, double %tmp) #0 {
entry:
@@ -769,15 +769,15 @@ entry:
; GCN-LABEL: {{^}}tail_call_byval_align16:
; GCN-NOT: s32
-; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:32
-; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:36
-; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:20
-; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:16
+; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:16
+; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:20
+; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:4
+; GCN: buffer_store_dword v32, off, s[0:3], s32{{$}}
; GCN: s_getpc_b64
-; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GCN-NOT: s32
; GCN: s_setpc_b64
define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
@@ -789,15 +789,15 @@ entry:
; GCN-LABEL: {{^}}tail_call_stack_passed_arg_alignment_v32i32_f64:
; GCN-NOT: s32
-; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:8
-; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4
-; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:8
+; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GCN: buffer_load_dword v32, off, s[0:3], s32{{$}}
+; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:4
+; GCN: buffer_store_dword v32, off, s[0:3], s32{{$}}
+; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:4
; GCN: s_getpc_b64
-; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GCN-NOT: s32
; GCN: s_setpc_b64
define void @tail_call_stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, double %tmp) #0 {
@@ -808,13 +808,13 @@ entry:
; GCN-LABEL: {{^}}stack_12xv3i32:
; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 12
-; GCN: buffer_store_dword [[REG12]], {{.*}} offset:4
+; GCN: buffer_store_dword [[REG12]], {{.*$}}
; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 13
-; GCN: buffer_store_dword [[REG13]], {{.*}} offset:8
+; GCN: buffer_store_dword [[REG13]], {{.*}} offset:4
; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 14
-; GCN: buffer_store_dword [[REG14]], {{.*}} offset:12
+; GCN: buffer_store_dword [[REG14]], {{.*}} offset:8
; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 15
-; GCN: buffer_store_dword [[REG15]], {{.*}} offset:16
+; GCN: buffer_store_dword [[REG15]], {{.*}} offset:12
; GCN: v_mov_b32_e32 v31, 11
; GCN: s_getpc
define void @stack_12xv3i32() #0 {
@@ -837,13 +837,13 @@ entry:
; GCN-LABEL: {{^}}stack_12xv3f32:
; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000
-; GCN: buffer_store_dword [[REG12]], {{.*}} offset:4
+; GCN: buffer_store_dword [[REG12]], {{.*$}}
; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000
-; GCN: buffer_store_dword [[REG13]], {{.*}} offset:8
+; GCN: buffer_store_dword [[REG13]], {{.*}} offset:4
; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000
-; GCN: buffer_store_dword [[REG14]], {{.*}} offset:12
+; GCN: buffer_store_dword [[REG14]], {{.*}} offset:8
; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000
-; GCN: buffer_store_dword [[REG15]], {{.*}} offset:16
+; GCN: buffer_store_dword [[REG15]], {{.*}} offset:12
; GCN: v_mov_b32_e32 v31, 0x41300000
; GCN: s_getpc
define void @stack_12xv3f32() #0 {
@@ -867,21 +867,21 @@ entry:
; GCN-LABEL: {{^}}stack_8xv5i32:
; GCN: v_mov_b32_e32 [[REG8:v[0-9]+]], 8
-; GCN: buffer_store_dword [[REG8]], {{.*}} offset:4
+; GCN: buffer_store_dword [[REG8]], {{.*$}}
; GCN: v_mov_b32_e32 [[REG9:v[0-9]+]], 9
-; GCN: buffer_store_dword [[REG9]], {{.*}} offset:8
+; GCN: buffer_store_dword [[REG9]], {{.*}} offset:4
; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 10
-; GCN: buffer_store_dword [[REG10]], {{.*}} offset:12
+; GCN: buffer_store_dword [[REG10]], {{.*}} offset:8
; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 11
-; GCN: buffer_store_dword [[REG11]], {{.*}} offset:16
+; GCN: buffer_store_dword [[REG11]], {{.*}} offset:12
; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 12
-; GCN: buffer_store_dword [[REG12]], {{.*}} offset:20
+; GCN: buffer_store_dword [[REG12]], {{.*}} offset:16
; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 13
-; GCN: buffer_store_dword [[REG13]], {{.*}} offset:24
+; GCN: buffer_store_dword [[REG13]], {{.*}} offset:20
; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 14
-; GCN: buffer_store_dword [[REG14]], {{.*}} offset:28
+; GCN: buffer_store_dword [[REG14]], {{.*}} offset:24
; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 15
-; GCN: buffer_store_dword [[REG15]], {{.*}} offset:32
+; GCN: buffer_store_dword [[REG15]], {{.*}} offset:28
; GCN: v_mov_b32_e32 v31, 7
; GCN: s_getpc
@@ -901,21 +901,21 @@ entry:
; GCN-LABEL: {{^}}stack_8xv5f32:
; GCN: v_mov_b32_e32 [[REG8:v[0-9]+]], 0x41000000
-; GCN: buffer_store_dword [[REG8]], {{.*}} offset:4
+; GCN: buffer_store_dword [[REG8]], {{.*$}}
; GCN: v_mov_b32_e32 [[REG9:v[0-9]+]], 0x41100000
-; GCN: buffer_store_dword [[REG9]], {{.*}} offset:8
+; GCN: buffer_store_dword [[REG9]], {{.*}} offset:4
; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 0x41200000
-; GCN: buffer_store_dword [[REG10]], {{.*}} offset:12
+; GCN: buffer_store_dword [[REG10]], {{.*}} offset:8
; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 0x41300000
-; GCN: buffer_store_dword [[REG11]], {{.*}} offset:16
+; GCN: buffer_store_dword [[REG11]], {{.*}} offset:12
; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000
-; GCN: buffer_store_dword [[REG12]], {{.*}} offset:20
+; GCN: buffer_store_dword [[REG12]], {{.*}} offset:16
; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000
-; GCN: buffer_store_dword [[REG13]], {{.*}} offset:24
+; GCN: buffer_store_dword [[REG13]], {{.*}} offset:20
; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000
-; GCN: buffer_store_dword [[REG14]], {{.*}} offset:28
+; GCN: buffer_store_dword [[REG14]], {{.*}} offset:24
; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000
-; GCN: buffer_store_dword [[REG15]], {{.*}} offset:32
+; GCN: buffer_store_dword [[REG15]], {{.*}} offset:28
; GCN: v_mov_b32_e32 v31, 0x40e00000
; GCN: s_getpc
diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
index 70c69d9f6c0..f1d10aca515 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
@@ -23,7 +23,7 @@ define void @callee_no_stack_no_fp_elim() #1 {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt
; GCN-NEXT: v_mov_b32_e32 v0, 0{{$}}
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4{{$}}
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32{{$}}
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @callee_with_stack() #0 {
@@ -37,13 +37,13 @@ define void @callee_with_stack() #0 {
; GCN-NEXT: s_waitcnt
; GCN: s_mov_b32 s5, s32
; GCN-DAG: s_add_u32 s32, s32, 0x400{{$}}
-; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:8
+; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:4
; GCN-DAG: v_writelane_b32 v32, s33,
; GCN-DAG: v_writelane_b32 v32, s34,
; GCN-DAG: v_writelane_b32 v32, s35,
; GCN-DAG: v_mov_b32_e32 v0, 0{{$}}
-; GCN-DAG: buffer_store_dword v0, off, s[0:3], s5 offset:4{{$}}
+; GCN-DAG: buffer_store_dword v0, off, s[0:3], s5{{$}}
; GCN-DAG: s_mov_b32 s33, s5
@@ -52,7 +52,7 @@ define void @callee_with_stack() #0 {
; GCN-DAG: v_readlane_b32 s35,
; GCN-DAG: v_readlane_b32 s34,
; GCN-DAG: v_readlane_b32 s33,
-; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:8
+; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4
; GCN: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @callee_with_stack_and_call() #0 {
@@ -72,7 +72,7 @@ define void @callee_with_stack_and_call() #0 {
; GCN: s_waitcnt
; GCN: s_mov_b32 s5, s32
; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
-; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 offset:4
+; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
; GCN-DAG: v_writelane_b32 v32, s33, 0
; GCN-DAG: v_writelane_b32 v32, s34, 1
@@ -84,7 +84,7 @@ define void @callee_with_stack_and_call() #0 {
; GCN-DAG: v_readlane_b32 s33, v32, 0
; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 offset:4
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
; GCN: s_sub_u32 s32, s32, 0x400
@@ -99,7 +99,7 @@ declare void @external_void_func_void() #0
; Make sure if a CSR vgpr is used for SGPR spilling, it is saved and restored
; GCN-LABEL: {{^}}callee_func_sgpr_spill_no_calls:
; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
-; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
; GCN: v_writelane_b32 v32
@@ -107,7 +107,7 @@ declare void @external_void_func_void() #0
; GCN: v_readlane_b32 s{{[0-9]+}}, v32
; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
; GCN-NEXT: s_waitcnt
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll
index fa5d20a1e96..02203cfe272 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll
@@ -116,7 +116,7 @@ define void @use_workgroup_id_x() #1 {
; GCN-LABEL: {{^}}use_stack_workgroup_id_x:
; GCN: s_waitcnt
; GCN-NOT: s32
-; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:4
+; GCN: buffer_store_dword v0, off, s[0:3], s32{{$}}
; GCN: ; use s6
; GCN: s_setpc_b64
define void @use_stack_workgroup_id_x() #1 {
@@ -429,7 +429,7 @@ define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_z() #1 {
}
; GCN-LABEL: {{^}}use_every_sgpr_input:
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32{{$}}
; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s6
; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s7
; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
@@ -577,7 +577,7 @@ define void @func_use_every_sgpr_input_call_use_workgroup_id_xyz() #1 {
; GCN: s_swappc_b64
-; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset:4
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5{{$}}
; GCN-DAG: v_mov_b32_e32 v[[LO1:[0-9]+]], s[[LO_X]]
; GCN-DAG: v_mov_b32_e32 v[[HI1:[0-9]+]], s[[HI_X]]
; GCN-DAG: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO1]]:[[HI1]]{{\]}}
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
index aa33dfa3675..7db1070faea 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
@@ -230,11 +230,11 @@ define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_z() #1 {
}
; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x:
-; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:4{{$}}
+; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GCN: buffer_load_dword v32, off, s[0:3], s32{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32
-; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @too_many_args_use_workitem_id_x(
@@ -289,7 +289,7 @@ define void @too_many_args_use_workitem_id_x(
; GCN: s_mov_b32 s33, s7
; GCN: s_mov_b32 s32, s33
-; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:4
+; GCN: buffer_store_dword v0, off, s[0:3], s32{{$}}
; GCN: s_mov_b32 s4, s33
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x() #1 {
@@ -307,7 +307,7 @@ define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x() #1 {
; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x:
; GCN: s_mov_b32 s5, s32
-; GCN: buffer_store_dword v1, off, s[0:3], s32 offset:
+; GCN: buffer_store_dword v1, off, s[0:3], s32{{$}}
; GCN: s_swappc_b64
define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 {
store volatile i32 %arg0, i32 addrspace(1)* undef
@@ -326,14 +326,14 @@ define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 {
; Requires loading and storing to stack slot.
; GCN-LABEL: {{^}}too_many_args_call_too_many_args_use_workitem_id_x:
; GCN: s_add_u32 s32, s32, 0x400{{$}}
-; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Spill
-; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4
+; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
+; GCN: buffer_load_dword v32, off, s[0:3], s5{{$}}
-; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4{{$}}
+; GCN: buffer_store_dword v32, off, s[0:3], s32{{$}}
; GCN: s_swappc_b64
-; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Reload
+; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
; GCN: s_sub_u32 s32, s32, 0x400{{$}}
; GCN: s_setpc_b64
define void @too_many_args_call_too_many_args_use_workitem_id_x(
@@ -350,18 +350,17 @@ define void @too_many_args_call_too_many_args_use_workitem_id_x(
}
; stack layout:
-; frame[0] = emergency stack slot
-; frame[1] = byval arg32
-; frame[2] = stack passed workitem ID x
-; frame[3] = VGPR spill slot
+; frame[0] = byval arg32
+; frame[1] = stack passed workitem ID x
+; frame[2] = VGPR spill slot
; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x_byval:
-; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:8
+; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GCN-NEXT: s_waitcnt
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v32
-; GCN: buffer_load_dword v0, off, s[0:3], s32 offset:4
-; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GCN: buffer_load_dword v0, off, s[0:3], s32{{$}}
+; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GCN: s_setpc_b64
define void @too_many_args_use_workitem_id_x_byval(
i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
@@ -410,13 +409,9 @@ define void @too_many_args_use_workitem_id_x_byval(
ret void
}
-; frame[0] = emergency stack slot
-; frame[1] =
-
-; sp[0] = callee emergency stack slot reservation
-; sp[1] = byval
-; sp[2] = ??
-; sp[3] = stack passed workitem ID x
+; sp[0] = byval
+; sp[1] = ??
+; sp[2] = stack passed workitem ID x
; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x_byval:
; GCN: enable_vgpr_workitem_id = 0
@@ -427,10 +422,10 @@ define void @too_many_args_use_workitem_id_x_byval(
; GCN-NOT: s32
; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}
; GCN: buffer_store_dword [[K]], off, s[0:3], s33 offset:4
-; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:8
+; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:4
; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33 offset:4
-; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32 offset:4{{$}}
+; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32{{$}}
; GCN: v_mov_b32_e32 [[RELOAD_BYVAL]],
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_byval() #1 {
@@ -451,11 +446,11 @@ define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_byval() #1
; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x_byval:
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}
-; GCN: buffer_store_dword [[K]], off, s[0:3], s5 offset:4
-; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:8
+; GCN: buffer_store_dword [[K]], off, s[0:3], s5{{$}}
+; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:4
-; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s5 offset:4
-; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32 offset:4{{$}}
+; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s5{{$}}
+; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32{{$}}
; GCN: v_mov_b32_e32 [[RELOAD_BYVAL]],
; GCN: s_swappc_b64
define void @func_call_too_many_args_use_workitem_id_x_byval() #1 {
@@ -475,15 +470,15 @@ define void @func_call_too_many_args_use_workitem_id_x_byval() #1 {
}
; GCN-LABEL: {{^}}too_many_args_use_workitem_id_xyz:
-; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GCN: buffer_load_dword v32, off, s[0:3], s32{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32
; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:4{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32
; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:8{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32
-; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:12{{$}}
-; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32
-; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @too_many_args_use_workitem_id_xyz(
@@ -537,10 +532,9 @@ define void @too_many_args_use_workitem_id_xyz(
ret void
}
-; frame[0] = callee emergency stack slot
-; frame[1] = ID X
-; frame[2] = ID Y
-; frame[3] = ID Z
+; frame[0] = ID X
+; frame[1] = ID Y
+; frame[2] = ID Z
; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_xyz:
; GCN: enable_vgpr_workitem_id = 2
@@ -548,9 +542,9 @@ define void @too_many_args_use_workitem_id_xyz(
; GCN: s_mov_b32 s33, s7
; GCN: s_mov_b32 s32, s33
-; GCN-DAG: buffer_store_dword v0, off, s[0:3], s32 offset:4
-; GCN-DAG: buffer_store_dword v1, off, s[0:3], s32 offset:8
-; GCN-DAG: buffer_store_dword v2, off, s[0:3], s32 offset:12
+; GCN-DAG: buffer_store_dword v0, off, s[0:3], s32{{$}}
+; GCN-DAG: buffer_store_dword v1, off, s[0:3], s32 offset:4
+; GCN-DAG: buffer_store_dword v2, off, s[0:3], s32 offset:8
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_xyz() #1 {
call void @too_many_args_use_workitem_id_xyz(
@@ -567,15 +561,14 @@ define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_xyz() #1 {
; workitem ID X in register, yz on stack
; v31 = workitem ID X
-; frame[0] = emergency slot
-; frame[1] = workitem Y
-; frame[2] = workitem Z
+; frame[0] = workitem Y
+; frame[1] = workitem Z
; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x_stack_yz:
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v31
-; GCN: buffer_load_dword v31, off, s[0:3], s32 offset:4{{$}}
+; GCN: buffer_load_dword v31, off, s[0:3], s32{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v31
-; GCN: buffer_load_dword v31, off, s[0:3], s32 offset:8{{$}}
+; GCN: buffer_load_dword v31, off, s[0:3], s32 offset:4{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v31
; GCN: s_waitcnt
@@ -631,9 +624,8 @@ define void @too_many_args_use_workitem_id_x_stack_yz(
ret void
}
-; frame[0] = callee emergency stack slot
-; frame[1] = ID Y
-; frame[2] = ID Z
+; frame[0] = ID Y
+; frame[1] = ID Z
; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x_stack_yz:
; GCN: enable_vgpr_workitem_id = 2
@@ -642,8 +634,8 @@ define void @too_many_args_use_workitem_id_x_stack_yz(
; GCN: s_mov_b32 s32, s33
; GCN-DAG: v_mov_b32_e32 v31, v0
-; GCN-DAG: buffer_store_dword v1, off, s[0:3], s32 offset:4
-; GCN-DAG: buffer_store_dword v2, off, s[0:3], s32 offset:8
+; GCN-DAG: buffer_store_dword v1, off, s[0:3], s32{{$}}
+; GCN-DAG: buffer_store_dword v2, off, s[0:3], s32 offset:4
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_stack_yz() #1 {
call void @too_many_args_use_workitem_id_x_stack_yz(
diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
index 4b38fb8e68d..7fd669c98c5 100644
--- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
@@ -30,7 +30,7 @@ define float @call_split_type_used_outside_block_v2f32() #0 {
; GCN-NEXT: s_mov_b32 s5, s32
; GCN-NEXT: s_add_u32 s32, s32, 0x400
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: v_writelane_b32 v32, s33, 0
; GCN-NEXT: v_writelane_b32 v32, s34, 1
@@ -47,7 +47,7 @@ define float @call_split_type_used_outside_block_v2f32() #0 {
; GCN-NEXT: v_readlane_b32 s34, v32, 1
; GCN-NEXT: v_readlane_b32 s33, v32, 0
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: s_sub_u32 s32, s32, 0x400
; GCN-NEXT: s_waitcnt vmcnt(0)
@@ -68,7 +68,7 @@ define float @call_split_type_used_outside_block_v3f32() #0 {
; GCN-NEXT: s_mov_b32 s5, s32
; GCN-NEXT: s_add_u32 s32, s32, 0x400
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: v_writelane_b32 v32, s33, 0
; GCN-NEXT: v_writelane_b32 v32, s34, 1
@@ -85,7 +85,7 @@ define float @call_split_type_used_outside_block_v3f32() #0 {
; GCN-NEXT: v_readlane_b32 s34, v32, 1
; GCN-NEXT: v_readlane_b32 s33, v32, 0
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: s_sub_u32 s32, s32, 0x400
; GCN-NEXT: s_waitcnt vmcnt(0)
@@ -106,7 +106,7 @@ define half @call_split_type_used_outside_block_v4f16() #0 {
; GCN-NEXT: s_mov_b32 s5, s32
; GCN-NEXT: s_add_u32 s32, s32, 0x400
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: v_writelane_b32 v32, s33, 0
; GCN-NEXT: v_writelane_b32 v32, s34, 1
@@ -123,7 +123,7 @@ define half @call_split_type_used_outside_block_v4f16() #0 {
; GCN-NEXT: v_readlane_b32 s34, v32, 1
; GCN-NEXT: v_readlane_b32 s33, v32, 0
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: s_sub_u32 s32, s32, 0x400
; GCN-NEXT: s_waitcnt vmcnt(0)
@@ -144,7 +144,7 @@ define { i32, half } @call_split_type_used_outside_block_struct() #0 {
; GCN-NEXT: s_mov_b32 s5, s32
; GCN-NEXT: s_add_u32 s32, s32, 0x400
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: v_writelane_b32 v32, s33, 0
; GCN-NEXT: v_writelane_b32 v32, s34, 1
@@ -162,7 +162,7 @@ define { i32, half } @call_split_type_used_outside_block_struct() #0 {
; GCN-NEXT: v_mov_b32_e32 v1, v4
; GCN-NEXT: v_readlane_b32 s33, v32, 0
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: s_sub_u32 s32, s32, 0x400
; GCN-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
index 819b0e5bfc5..450146f23bb 100644
--- a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
+++ b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
@@ -9,11 +9,8 @@
; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN: s_sub_u32 s6, s32, s4
-; CI-NEXT: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s6, 6
-; CI-NEXT: v_add_i32_e64 v0, s[6:7], 4, [[SCALED]]
-
-; GFX9-NEXT: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s6
-; GFX9-NEXT: v_add_u32_e32 v0, 4, [[SCALED]]
+; CI-NEXT: v_lshr_b32_e64 v0, s6, 6
+; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s6
; GCN-NOT: v_mov
; GCN: ds_write_b32 v0, v0
@@ -23,6 +20,36 @@ define void @func_mov_fi_i32() #0 {
ret void
}
+; Offset due to different objects
+; GCN-LABEL: {{^}}func_mov_fi_i32_offset:
+; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+
+; CI: s_sub_u32 s6, s32, s4
+; CI-NEXT: v_lshr_b32_e64 v0, s6, 6
+
+; CI: s_sub_u32 s6, s32, s4
+; CI-NEXT: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s6, 6
+; CI-NEXT: v_add_i32_e64 v1, s[6:7], 4, [[SCALED]]
+; CI-NOT: v_mov
+; CI: ds_write_b32 v0, v0
+; CI-NEXT: ds_write_b32 v0, v1
+
+; GFX9: s_sub_u32 s6, s32, s4
+; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s6
+; GFX9-DAG: ds_write_b32 v0, v0
+
+; GFX9-DAG: s_sub_u32 s6, s32, s4
+; GFX9-NEXT: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s6
+; GFX9-NEXT: v_add_u32_e32 v0, 4, [[SCALED]]
+; GFX9-NEXT: ds_write_b32 v0, v0
+define void @func_mov_fi_i32_offset() #0 {
+ %alloca0 = alloca i32, addrspace(5)
+ %alloca1 = alloca i32, addrspace(5)
+ store volatile i32 addrspace(5)* %alloca0, i32 addrspace(5)* addrspace(3)* undef
+ store volatile i32 addrspace(5)* %alloca1, i32 addrspace(5)* addrspace(3)* undef
+ ret void
+}
+
; Materialize into an add of a constant offset from the FI.
; FIXME: Should be able to merge adds
@@ -31,12 +58,10 @@ define void @func_mov_fi_i32() #0 {
; GCN: s_sub_u32 s6, s32, s4
; CI-NEXT: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s6, 6
-; CI-NEXT: v_add_i32_e64 v0, s[6:7], 4, [[SCALED]]
-; CI-NEXT: v_add_i32_e32 v0, vcc, 4, v0
+; CI-NEXT: v_add_i32_e32 v0, vcc, 4, [[SCALED]]
; GFX9-NEXT: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s6
; GFX9-NEXT: v_add_u32_e32 v0, 4, [[SCALED]]
-; GFX9-NEXT: v_add_u32_e32 v0, 4, v0
; GCN-NOT: v_mov
@@ -54,11 +79,9 @@ define void @func_add_constant_to_fi_i32() #0 {
; GCN-LABEL: {{^}}func_other_fi_user_i32:
; GCN: s_sub_u32 s6, s32, s4
-; CI-NEXT: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s6, 6
-; CI-NEXT: v_add_i32_e64 v0, s[6:7], 4, [[SCALED]]
+; CI-NEXT: v_lshr_b32_e64 v0, s6, 6
-; GFX9-NEXT: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s6
-; GFX9-NEXT: v_add_u32_e32 v0, 4, [[SCALED]]
+; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s6
; GCN-NEXT: v_mul_u32_u24_e32 v0, 9, v0
; GCN-NOT: v_mov
@@ -92,12 +115,10 @@ define void @func_load_private_arg_i32_ptr(i32 addrspace(5)* %ptr) #0 {
; GCN-NEXT: s_sub_u32 [[SUB_OFFSET:s[0-9]+]], s32, s4
; CI-NEXT: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], [[SUB_OFFSET]], 6
-; CI-NEXT: v_add_i32_e64 [[ADD:v[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 4, [[SHIFT]]
-; CI-NEXT: v_add_i32_e32 v0, vcc, 4, [[ADD]]
+; CI-NEXT: v_or_b32_e32 v0, 4, [[SHIFT]]
; GFX9-NEXT: v_lshrrev_b32_e64 [[SHIFT:v[0-9]+]], 6, [[SUB_OFFSET]]
-; GFX9-NEXT: v_add_u32_e32 [[ADD:v[0-9]+]], 4, [[SHIFT]]
-; GFX9-NEXT: v_add_u32_e32 v0, 4, [[ADD]]
+; GFX9-NEXT: v_or_b32_e32 v0, 4, [[SHIFT]]
; GCN-NOT: v_mov
; GCN: ds_write_b32 v0, v0
@@ -130,17 +151,15 @@ define void @void_func_byval_struct_i8_i32_ptr_value({ i8, i32 } addrspace(5)* b
; GCN: s_sub_u32 [[SUB_OFFSET:s[0-9]+]], s32, s4
; CI: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], [[SUB_OFFSET]], 6
-; CI: v_add_i32_e64 [[ADD:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 4, [[SHIFT]]
; GFX9: v_lshrrev_b32_e64 [[SHIFT:v[0-9]+]], 6, [[SUB_OFFSET]]
-; GFX9: v_add_u32_e32 [[ADD:v[0-9]+]], 4, [[SHIFT]]
; GCN: s_and_saveexec_b64
-; CI: v_add_i32_e32 v0, vcc, 4, [[ADD]]
+; CI: v_add_i32_e32 v0, vcc, 4, [[SHIFT]]
; CI: buffer_load_dword v1, v1, s[0:3], s4 offen offset:4{{$}}
-; GFX9: v_add_u32_e32 v0, 4, [[ADD]]
+; GFX9: v_add_u32_e32 v0, 4, [[SHIFT]]
; GFX9: buffer_load_dword v1, v{{[0-9]+}}, s[0:3], s4 offen offset:4{{$}}
; GCN: ds_write_b32
@@ -162,7 +181,7 @@ ret:
; Added offset can't be used with VOP3 add
; GCN-LABEL: {{^}}func_other_fi_user_non_inline_imm_offset_i32:
; GCN: s_sub_u32 s6, s32, s4
-; GCN-DAG: s_movk_i32 s6, 0x204
+; GCN-DAG: s_movk_i32 s6, 0x200
; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s6, 6
; CI: v_add_i32_e64 [[VZ:v[0-9]+]], s[6:7], s6, [[SCALED]]
@@ -186,7 +205,7 @@ define void @func_other_fi_user_non_inline_imm_offset_i32() #0 {
; GCN-LABEL: {{^}}func_other_fi_user_non_inline_imm_offset_i32_vcc_live:
; GCN: s_sub_u32 [[DIFF:s[0-9]+]], s32, s4
-; GCN-DAG: s_movk_i32 [[OFFSET:s[0-9]+]], 0x204
+; GCN-DAG: s_movk_i32 [[OFFSET:s[0-9]+]], 0x200
; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], [[DIFF]], 6
; CI: v_add_i32_e64 [[VZ:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, [[OFFSET]], [[SCALED]]
@@ -241,7 +260,16 @@ bb5:
; GCN-LABEL: {{^}}alloca_ptr_nonentry_block:
; GCN: s_and_saveexec_b64
-; GCN: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:12
+; GCN: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4
+; GCN: s_sub_u32 [[SUB_OFFSET:s[0-9]+]], s32, s4
+
+; CI: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], [[SUB_OFFSET]], 6
+; CI-NEXT: v_or_b32_e32 [[PTR:v[0-9]+]], 4, [[SHIFT]]
+
+; GFX9: v_lshrrev_b32_e64 [[SHIFT:v[0-9]+]], 6, [[SUB_OFFSET]]
+; GFX9-NEXT: v_or_b32_e32 [[PTR:v[0-9]+]], 4, [[SHIFT]]
+
+; GCN: ds_write_b32 v{{[0-9]+}}, [[PTR]]
define void @alloca_ptr_nonentry_block(i32 %arg0) #0 {
%alloca0 = alloca { i8, i32 }, align 4, addrspace(5)
%cmp = icmp eq i32 %arg0, 0
diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll
index 9186b91f763..1faf0374dbc 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args.ll
@@ -516,8 +516,8 @@ define void @void_func_struct_i8_i32({ i8, i32 } %arg0) #0 {
}
; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32:
-; GCN-DAG: buffer_load_ubyte v[[ELT0:[0-9]+]], off, s[0:3], s32 offset:4{{$}}
-; GCN-DAG: buffer_load_dword v[[ELT1:[0-9]+]], off, s[0:3], s32 offset:8{{$}}
+; GCN-DAG: buffer_load_ubyte v[[ELT0:[0-9]+]], off, s[0:3], s32{{$}}
+; GCN-DAG: buffer_load_dword v[[ELT1:[0-9]+]], off, s[0:3], s32 offset:4{{$}}
; GCN-DAG: buffer_store_dword v[[ELT1]]
; GCN-DAG: buffer_store_byte v[[ELT0]]
define void @void_func_byval_struct_i8_i32({ i8, i32 } addrspace(5)* byval %arg0) #0 {
@@ -527,10 +527,10 @@ define void @void_func_byval_struct_i8_i32({ i8, i32 } addrspace(5)* byval %arg0
}
; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_x2:
-; GCN: buffer_load_ubyte v[[ELT0_0:[0-9]+]], off, s[0:3], s32 offset:4{{$}}
-; GCN: buffer_load_dword v[[ELT1_0:[0-9]+]], off, s[0:3], s32 offset:8{{$}}
-; GCN: buffer_load_ubyte v[[ELT0_1:[0-9]+]], off, s[0:3], s32 offset:12{{$}}
-; GCN: buffer_load_dword v[[ELT1_1:[0-9]+]], off, s[0:3], s32 offset:16{{$}}
+; GCN: buffer_load_ubyte v[[ELT0_0:[0-9]+]], off, s[0:3], s32{{$}}
+; GCN: buffer_load_dword v[[ELT1_0:[0-9]+]], off, s[0:3], s32 offset:4{{$}}
+; GCN: buffer_load_ubyte v[[ELT0_1:[0-9]+]], off, s[0:3], s32 offset:8{{$}}
+; GCN: buffer_load_dword v[[ELT1_1:[0-9]+]], off, s[0:3], s32 offset:12{{$}}
; GCN: ds_write_b32 v0, v0
; GCN: s_setpc_b64
@@ -544,7 +544,7 @@ define void @void_func_byval_struct_i8_i32_x2({ i8, i32 } addrspace(5)* byval %a
}
; GCN-LABEL: {{^}}void_func_byval_i32_byval_i64:
-; GCN-DAG: buffer_load_dword v[[ARG0_LOAD:[0-9]+]], off, s[0:3], s32 offset:4{{$}}
+; GCN-DAG: buffer_load_dword v[[ARG0_LOAD:[0-9]+]], off, s[0:3], s32{{$}}
; GCN-DAG: buffer_load_dword v[[ARG1_LOAD0:[0-9]+]], off, s[0:3], s32 offset:8{{$}}
; GCN-DAG: buffer_load_dword v[[ARG1_LOAD1:[0-9]+]], off, s[0:3], s32 offset:12{{$}}
; GCN-DAG: buffer_store_dword v[[ARG0_LOAD]], off
@@ -566,9 +566,9 @@ define void @void_func_byval_i32_byval_i64(i32 addrspace(5)* byval %arg0, i64 ad
; GCN-DAG: buffer_store_dwordx4 v[20:23], off
; GCN-DAG: buffer_store_dwordx4 v[24:27], off
; GCN-DAG: buffer_store_dwordx4 v[28:31], off
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1:[0-9]+]], off, s[0:3], s32 offset:4{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:8
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:12
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1:[0-9]+]], off, s[0:3], s32{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:4
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:8
; GCN: buffer_store_dword v[[LOAD_ARG1]]
; GCN: buffer_store_dwordx2 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_1]]{{\]}}, off
@@ -581,14 +581,14 @@ define void @void_func_v32i32_i32_i64(<32 x i32> %arg0, i32 %arg1, i64 %arg2) #0
; FIXME: Different ext load types on CI vs. VI
; GCN-LABEL: {{^}}void_func_v32i32_i1_i8_i16:
-; GCN-DAG: buffer_load_ubyte [[LOAD_ARG1:v[0-9]+]], off, s[0:3], s32 offset:4{{$}}
-; VI-DAG: buffer_load_ushort [[LOAD_ARG2:v[0-9]+]], off, s[0:3], s32 offset:8{{$}}
-; VI-DAG: buffer_load_ushort [[LOAD_ARG3:v[0-9]+]], off, s[0:3], s32 offset:12{{$}}
-; VI-DAG: buffer_load_ushort [[LOAD_ARG4:v[0-9]+]], off, s[0:3], s32 offset:16{{$}}
+; GCN-DAG: buffer_load_ubyte [[LOAD_ARG1:v[0-9]+]], off, s[0:3], s32{{$}}
+; VI-DAG: buffer_load_ushort [[LOAD_ARG2:v[0-9]+]], off, s[0:3], s32 offset:4{{$}}
+; VI-DAG: buffer_load_ushort [[LOAD_ARG3:v[0-9]+]], off, s[0:3], s32 offset:8{{$}}
+; VI-DAG: buffer_load_ushort [[LOAD_ARG4:v[0-9]+]], off, s[0:3], s32 offset:12{{$}}
-; CI-DAG: buffer_load_dword [[LOAD_ARG2:v[0-9]+]], off, s[0:3], s32 offset:8{{$}}
-; CI-DAG: buffer_load_dword [[LOAD_ARG3:v[0-9]+]], off, s[0:3], s32 offset:12{{$}}
-; CI-DAG: buffer_load_dword [[LOAD_ARG4:v[0-9]+]], off, s[0:3], s32 offset:16{{$}}
+; CI-DAG: buffer_load_dword [[LOAD_ARG2:v[0-9]+]], off, s[0:3], s32 offset:4{{$}}
+; CI-DAG: buffer_load_dword [[LOAD_ARG3:v[0-9]+]], off, s[0:3], s32 offset:8{{$}}
+; CI-DAG: buffer_load_dword [[LOAD_ARG4:v[0-9]+]], off, s[0:3], s32 offset:12{{$}}
; GCN-DAG: v_and_b32_e32 [[TRUNC_ARG1_I1:v[0-9]+]], 1, [[LOAD_ARG1]]
; CI-DAG: v_cvt_f16_f32_e32 [[CVT_ARG4:v[0-9]+]], [[LOAD_ARG4]]
@@ -609,10 +609,10 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1
}
; GCN-LABEL: {{^}}void_func_v32i32_v2i32_v2f32:
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32 offset:4{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:8{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:12{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:16{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:4{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:8{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:12{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[LOAD_ARG1_0]]:[[LOAD_ARG1_1]]{{\]}}, off
; GCN: buffer_store_dwordx2 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_1]]{{\]}}, off
@@ -636,15 +636,15 @@ define void @void_func_v32i32_v2i16_v2f16(<32 x i32> %arg0, <2 x i16> %arg1, <2
}
; GCN-LABEL: {{^}}void_func_v32i32_v2i64_v2f64:
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32 offset:4{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:8{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s32 offset:12{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s32 offset:16{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:4{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s32 offset:8{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s32 offset:12{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:20{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:24{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s32 offset:28{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s32 offset:32{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:16{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:20{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s32 offset:24{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s32 offset:28{{$}}
; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG1_0]]:[[LOAD_ARG1_3]]{{\]}}, off
; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_3]]{{\]}}, off
@@ -656,15 +656,15 @@ define void @void_func_v32i32_v2i64_v2f64(<32 x i32> %arg0, <2 x i64> %arg1, <2
}
; GCN-LABEL: {{^}}void_func_v32i32_v4i32_v4f32:
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32 offset:4{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:8{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s32 offset:12{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s32 offset:16{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:4{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s32 offset:8{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s32 offset:12{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:20{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:24{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s32 offset:28{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s32 offset:32{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:16{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:20{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s32 offset:24{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s32 offset:28{{$}}
; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG1_0]]:[[LOAD_ARG1_3]]{{\]}}, off
; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_3]]{{\]}}, off
@@ -676,23 +676,23 @@ define void @void_func_v32i32_v4i32_v4f32(<32 x i32> %arg0, <4 x i32> %arg1, <4
}
; GCN-LABEL: {{^}}void_func_v32i32_v8i32_v8f32:
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32 offset:4{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:8{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s32 offset:12{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s32 offset:16{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_4:[0-9]+]], off, s[0:3], s32 offset:20{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_5:[0-9]+]], off, s[0:3], s32 offset:24{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_6:[0-9]+]], off, s[0:3], s32 offset:28{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_7:[0-9]+]], off, s[0:3], s32 offset:32{{$}}
-
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:36{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:40{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s32 offset:44{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s32 offset:48{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_4:[0-9]+]], off, s[0:3], s32 offset:52{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_5:[0-9]+]], off, s[0:3], s32 offset:56{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_6:[0-9]+]], off, s[0:3], s32 offset:60{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_7:[0-9]+]], off, s[0:3], s32 offset:64{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:4{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s32 offset:8{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s32 offset:12{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_4:[0-9]+]], off, s[0:3], s32 offset:16{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_5:[0-9]+]], off, s[0:3], s32 offset:20{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_6:[0-9]+]], off, s[0:3], s32 offset:24{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_7:[0-9]+]], off, s[0:3], s32 offset:28{{$}}
+
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:32{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:36{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s32 offset:40{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s32 offset:44{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_4:[0-9]+]], off, s[0:3], s32 offset:48{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_5:[0-9]+]], off, s[0:3], s32 offset:52{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_6:[0-9]+]], off, s[0:3], s32 offset:56{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_7:[0-9]+]], off, s[0:3], s32 offset:60{{$}}
; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG1_4]]:[[LOAD_ARG1_7]]{{\]}}, off
; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG1_0]]:[[LOAD_ARG1_3]]{{\]}}, off
@@ -706,39 +706,39 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8
}
; GCN-LABEL: {{^}}void_func_v32i32_v16i32_v16f32:
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32 offset:4{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:8{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s32 offset:12{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_:[0-9]+]], off, s[0:3], s32 offset:16{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_4:[0-9]+]], off, s[0:3], s32 offset:20{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_5:[0-9]+]], off, s[0:3], s32 offset:24{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_6:[0-9]+]], off, s[0:3], s32 offset:28{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_7:[0-9]+]], off, s[0:3], s32 offset:32{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_8:[0-9]+]], off, s[0:3], s32 offset:36{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_9:[0-9]+]], off, s[0:3], s32 offset:40{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_10:[0-9]+]], off, s[0:3], s32 offset:44{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_11:[0-9]+]], off, s[0:3], s32 offset:48{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_12:[0-9]+]], off, s[0:3], s32 offset:52{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_13:[0-9]+]], off, s[0:3], s32 offset:56{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_14:[0-9]+]], off, s[0:3], s32 offset:60{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_15:[0-9]+]], off, s[0:3], s32 offset:64{{$}}
-
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:68{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:72{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s32 offset:76{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s32 offset:80{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_4:[0-9]+]], off, s[0:3], s32 offset:84{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_5:[0-9]+]], off, s[0:3], s32 offset:88{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_6:[0-9]+]], off, s[0:3], s32 offset:92{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_7:[0-9]+]], off, s[0:3], s32 offset:96{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_8:[0-9]+]], off, s[0:3], s32 offset:100{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_9:[0-9]+]], off, s[0:3], s32 offset:104{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_10:[0-9]+]], off, s[0:3], s32 offset:108{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_11:[0-9]+]], off, s[0:3], s32 offset:112{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_12:[0-9]+]], off, s[0:3], s32 offset:116{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_13:[0-9]+]], off, s[0:3], s32 offset:120{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_14:[0-9]+]], off, s[0:3], s32 offset:124{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_15:[0-9]+]], off, s[0:3], s32 offset:128{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:4{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s32 offset:8{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_:[0-9]+]], off, s[0:3], s32 offset:12{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_4:[0-9]+]], off, s[0:3], s32 offset:16{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_5:[0-9]+]], off, s[0:3], s32 offset:20{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_6:[0-9]+]], off, s[0:3], s32 offset:24{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_7:[0-9]+]], off, s[0:3], s32 offset:28{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_8:[0-9]+]], off, s[0:3], s32 offset:32{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_9:[0-9]+]], off, s[0:3], s32 offset:36{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_10:[0-9]+]], off, s[0:3], s32 offset:40{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_11:[0-9]+]], off, s[0:3], s32 offset:44{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_12:[0-9]+]], off, s[0:3], s32 offset:48{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_13:[0-9]+]], off, s[0:3], s32 offset:52{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_14:[0-9]+]], off, s[0:3], s32 offset:56{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_15:[0-9]+]], off, s[0:3], s32 offset:60{{$}}
+
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:64{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:68{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s32 offset:72{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s32 offset:76{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_4:[0-9]+]], off, s[0:3], s32 offset:80{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_5:[0-9]+]], off, s[0:3], s32 offset:84{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_6:[0-9]+]], off, s[0:3], s32 offset:88{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_7:[0-9]+]], off, s[0:3], s32 offset:92{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_8:[0-9]+]], off, s[0:3], s32 offset:96{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_9:[0-9]+]], off, s[0:3], s32 offset:100{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_10:[0-9]+]], off, s[0:3], s32 offset:104{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_11:[0-9]+]], off, s[0:3], s32 offset:108{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_12:[0-9]+]], off, s[0:3], s32 offset:112{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_13:[0-9]+]], off, s[0:3], s32 offset:116{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_14:[0-9]+]], off, s[0:3], s32 offset:120{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_15:[0-9]+]], off, s[0:3], s32 offset:124{{$}}
define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1, <16 x float> %arg2) #0 {
store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
store volatile <16 x i32> %arg1, <16 x i32> addrspace(1)* undef
diff --git a/llvm/test/CodeGen/AMDGPU/load-hi16.ll b/llvm/test/CodeGen/AMDGPU/load-hi16.ll
index 07ae3d00d07..599c978c51c 100644
--- a/llvm/test/CodeGen/AMDGPU/load-hi16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-hi16.ll
@@ -503,7 +503,7 @@ entry:
; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s32 offset:4094{{$}}
define void @load_private_hi_v2i16_reglo_vreg(i16 addrspace(5)* byval %in, i16 %reg) #0 {
entry:
- %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2045
+ %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2047
%load = load i16, i16 addrspace(5)* %gep
%build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
%build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
@@ -522,7 +522,7 @@ entry:
; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s32 offset:4094{{$}}
define void @load_private_hi_v2f16_reglo_vreg(half addrspace(5)* byval %in, half %reg) #0 {
entry:
- %gep = getelementptr inbounds half, half addrspace(5)* %in, i64 2045
+ %gep = getelementptr inbounds half, half addrspace(5)* %in, i64 2047
%load = load half, half addrspace(5)* %gep
%build0 = insertelement <2 x half> undef, half %reg, i32 0
%build1 = insertelement <2 x half> %build0, half %load, i32 1
@@ -577,7 +577,7 @@ entry:
; NO-D16-HI: buffer_load_ubyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}}
define void @load_private_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(5)* byval %in, i16 %reg) #0 {
entry:
- %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4091
+ %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
%load = load i8, i8 addrspace(5)* %gep
%ext = zext i8 %load to i16
%build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
@@ -597,7 +597,7 @@ entry:
; NO-D16-HI: buffer_load_ubyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}}
define void @load_private_hi_v2f16_reglo_vreg_zexti8(i8 addrspace(5)* byval %in, half %reg) #0 {
entry:
- %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4091
+ %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
%load = load i8, i8 addrspace(5)* %gep
%ext = zext i8 %load to i16
%bitcast = bitcast i16 %ext to half
@@ -618,7 +618,7 @@ entry:
; NO-D16-HI: buffer_load_sbyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}}
define void @load_private_hi_v2f16_reglo_vreg_sexti8(i8 addrspace(5)* byval %in, half %reg) #0 {
entry:
- %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4091
+ %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
%load = load i8, i8 addrspace(5)* %gep
%ext = sext i8 %load to i16
%bitcast = bitcast i16 %ext to half
@@ -639,7 +639,7 @@ entry:
; NO-D16-HI: buffer_load_sbyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}}
define void @load_private_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(5)* byval %in, i16 %reg) #0 {
entry:
- %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4091
+ %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
%load = load i8, i8 addrspace(5)* %gep
%ext = sext i8 %load to i16
%build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
@@ -796,7 +796,7 @@ entry:
%obj1 = alloca [4096 x i16], align 2, addrspace(5)
%bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
store volatile i32 123, i32 addrspace(5)* %bc
- %gep = getelementptr inbounds [4096 x i16], [4096 x i16] addrspace(5)* %obj1, i32 0, i32 2025
+ %gep = getelementptr inbounds [4096 x i16], [4096 x i16] addrspace(5)* %obj1, i32 0, i32 2027
%load = load i16, i16 addrspace(5)* %gep
%build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
%build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
@@ -813,7 +813,7 @@ entry:
%obj1 = alloca [4096 x i8], align 2, addrspace(5)
%bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
store volatile i32 123, i32 addrspace(5)* %bc
- %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4051
+ %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055
%load = load i8, i8 addrspace(5)* %gep
%ext = sext i8 %load to i16
%build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
@@ -831,7 +831,7 @@ entry:
%obj1 = alloca [4096 x i8], align 2, addrspace(5)
%bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
store volatile i32 123, i32 addrspace(5)* %bc
- %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4051
+ %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055
%load = load i8, i8 addrspace(5)* %gep
%ext = zext i8 %load to i16
%build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
@@ -975,9 +975,9 @@ entry:
; FIXME: Is there a cost to using the extload over not?
; GCN-LABEL: {{^}}load_private_v2i16_split:
; GCN: s_waitcnt
-; GFX900: buffer_load_ushort v0, off, s[0:3], s32 offset:4{{$}}
+; GFX900: buffer_load_ushort v0, off, s[0:3], s32{{$}}
; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:6
+; GFX900-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64
define <2 x i16> @load_private_v2i16_split(i16 addrspace(5)* byval %in) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/load-lo16.ll b/llvm/test/CodeGen/AMDGPU/load-lo16.ll
index 47f5ea35550..e1ff8f8ffd3 100644
--- a/llvm/test/CodeGen/AMDGPU/load-lo16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-lo16.ll
@@ -600,7 +600,7 @@ entry:
define void @load_private_lo_v2i16_reglo_vreg(i16 addrspace(5)* byval %in, i32 %reg) #0 {
entry:
%reg.bc = bitcast i32 %reg to <2 x i16>
- %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2045
+ %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2047
%load = load i16, i16 addrspace(5)* %gep
%build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
@@ -621,7 +621,7 @@ entry:
; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s32 offset:4094{{$}}
define void @load_private_lo_v2i16_reghi_vreg(i16 addrspace(5)* byval %in, i16 %reg) #0 {
entry:
- %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2045
+ %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2047
%load = load i16, i16 addrspace(5)* %gep
%build0 = insertelement <2 x i16> undef, i16 %reg, i32 1
%build1 = insertelement <2 x i16> %build0, i16 %load, i32 0
@@ -641,7 +641,7 @@ entry:
define void @load_private_lo_v2f16_reglo_vreg(half addrspace(5)* byval %in, i32 %reg) #0 {
entry:
%reg.bc = bitcast i32 %reg to <2 x half>
- %gep = getelementptr inbounds half, half addrspace(5)* %in, i64 2045
+ %gep = getelementptr inbounds half, half addrspace(5)* %in, i64 2047
%load = load half, half addrspace(5)* %gep
%build1 = insertelement <2 x half> %reg.bc, half %load, i32 0
store <2 x half> %build1, <2 x half> addrspace(1)* undef
@@ -714,7 +714,7 @@ entry:
define void @load_private_lo_v2i16_reglo_vreg_zexti8(i8 addrspace(5)* byval %in, i32 %reg) #0 {
entry:
%reg.bc = bitcast i32 %reg to <2 x i16>
- %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4091
+ %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
%load = load i8, i8 addrspace(5)* %gep
%ext = zext i8 %load to i16
%build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
@@ -734,7 +734,7 @@ entry:
define void @load_private_lo_v2i16_reglo_vreg_sexti8(i8 addrspace(5)* byval %in, i32 %reg) #0 {
entry:
%reg.bc = bitcast i32 %reg to <2 x i16>
- %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4091
+ %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
%load = load i8, i8 addrspace(5)* %gep
%ext = sext i8 %load to i16
%build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
@@ -905,7 +905,7 @@ entry:
%reg.bc = bitcast i32 %reg to <2 x i16>
%bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
store volatile i32 123, i32 addrspace(5)* %bc
- %gep = getelementptr inbounds [4096 x i16], [4096 x i16] addrspace(5)* %obj1, i32 0, i32 2025
+ %gep = getelementptr inbounds [4096 x i16], [4096 x i16] addrspace(5)* %obj1, i32 0, i32 2027
%load = load volatile i16, i16 addrspace(5)* %gep
%build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
@@ -924,7 +924,7 @@ entry:
%reg.bc = bitcast i32 %reg to <2 x i16>
%bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
store volatile i32 123, i32 addrspace(5)* %bc
- %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4051
+ %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055
%load = load volatile i8, i8 addrspace(5)* %gep
%load.ext = sext i8 %load to i16
%build1 = insertelement <2 x i16> %reg.bc, i16 %load.ext, i32 0
@@ -944,7 +944,7 @@ entry:
%reg.bc = bitcast i32 %reg to <2 x i16>
%bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
store volatile i32 123, i32 addrspace(5)* %bc
- %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4051
+ %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055
%load = load volatile i8, i8 addrspace(5)* %gep
%load.ext = zext i8 %load to i16
%build1 = insertelement <2 x i16> %reg.bc, i16 %load.ext, i32 0
@@ -964,7 +964,7 @@ entry:
%reg.bc = bitcast i32 %reg to <2 x half>
%bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
store volatile i32 123, i32 addrspace(5)* %bc
- %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4051
+ %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055
%load = load volatile i8, i8 addrspace(5)* %gep
%load.ext = sext i8 %load to i16
%bitcast = bitcast i16 %load.ext to half
@@ -985,7 +985,7 @@ entry:
%reg.bc = bitcast i32 %reg to <2 x half>
%bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
store volatile i32 123, i32 addrspace(5)* %bc
- %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4051
+ %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055
%load = load volatile i8, i8 addrspace(5)* %gep
%load.ext = zext i8 %load to i16
%bitcast = bitcast i16 %load.ext to half
diff --git a/llvm/test/CodeGen/AMDGPU/multi-dword-vgpr-spill.ll b/llvm/test/CodeGen/AMDGPU/multi-dword-vgpr-spill.ll
index 3c55dc8ef91..e2e77975c43 100644
--- a/llvm/test/CodeGen/AMDGPU/multi-dword-vgpr-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/multi-dword-vgpr-spill.ll
@@ -1,12 +1,12 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s
; CHECK-LABEL: spill_v2i32:
-; CHECK-DAG: buffer_store_dword v{{.*}} offset:24 ; 4-byte Folded Spill
-; CHECK-DAG: buffer_store_dword v{{.*}} offset:28 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:16 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:20 ; 4-byte Folded Spill
; CHECK: ;;#ASMSTART
; CHECK-NEXT: ;;#ASMEND
-; CHECK-DAG: buffer_load_dword v{{.*}} offset:24 ; 4-byte Folded Reload
-; CHECK-DAG: buffer_load_dword v{{.*}} offset:28 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:16 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:20 ; 4-byte Folded Reload
define void @spill_v2i32() {
entry:
@@ -25,12 +25,12 @@ entry:
}
; CHECK-LABEL: spill_v2f32:
-; CHECK-DAG: buffer_store_dword v{{.*}} offset:24 ; 4-byte Folded Spill
-; CHECK-DAG: buffer_store_dword v{{.*}} offset:28 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:16 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:20 ; 4-byte Folded Spill
; CHECK: ;;#ASMSTART
; CHECK-NEXT: ;;#ASMEND
-; CHECK-DAG: buffer_load_dword v{{.*}} offset:24 ; 4-byte Folded Reload
-; CHECK-DAG: buffer_load_dword v{{.*}} offset:28 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:16 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:20 ; 4-byte Folded Reload
define void @spill_v2f32() {
entry:
@@ -49,14 +49,14 @@ entry:
}
; CHECK-LABEL: spill_v3i32:
-; CHECK-DAG: buffer_store_dword v{{.*}} offset:48 ; 4-byte Folded Spill
-; CHECK-DAG: buffer_store_dword v{{.*}} offset:52 ; 4-byte Folded Spill
-; CHECK-DAG: buffer_store_dword v{{.*}} offset:56 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:32 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:36 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:40 ; 4-byte Folded Spill
; CHECK: ;;#ASMSTART
; CHECK-NEXT: ;;#ASMEND
-; CHECK-DAG: buffer_load_dword v{{.*}} offset:48 ; 4-byte Folded Reload
-; CHECK-DAG: buffer_load_dword v{{.*}} offset:52 ; 4-byte Folded Reload
-; CHECK-DAG: buffer_load_dword v{{.*}} offset:56 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:32 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:36 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:40 ; 4-byte Folded Reload
define void @spill_v3i32() {
entry:
@@ -75,14 +75,14 @@ entry:
}
; CHECK-LABEL: spill_v3f32:
-; CHECK-DAG: buffer_store_dword v{{.*}} offset:48 ; 4-byte Folded Spill
-; CHECK-DAG: buffer_store_dword v{{.*}} offset:52 ; 4-byte Folded Spill
-; CHECK-DAG: buffer_store_dword v{{.*}} offset:56 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:32 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:36 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:40 ; 4-byte Folded Spill
; CHECK: ;;#ASMSTART
; CHECK-NEXT: ;;#ASMEND
-; CHECK-DAG: buffer_load_dword v{{.*}} offset:48 ; 4-byte Folded Reload
-; CHECK-DAG: buffer_load_dword v{{.*}} offset:52 ; 4-byte Folded Reload
-; CHECK-DAG: buffer_load_dword v{{.*}} offset:56 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:32 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:36 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:40 ; 4-byte Folded Reload
define void @spill_v3f32() {
entry:
@@ -101,16 +101,16 @@ entry:
}
; CHECK-LABEL: spill_v4i32:
-; CHECK-DAG: buffer_store_dword v{{.*}} offset:48 ; 4-byte Folded Spill
-; CHECK-DAG: buffer_store_dword v{{.*}} offset:52 ; 4-byte Folded Spill
-; CHECK-DAG: buffer_store_dword v{{.*}} offset:56 ; 4-byte Folded Spill
-; CHECK-DAG: buffer_store_dword v{{.*}} offset:60 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:32 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:36 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:40 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:44 ; 4-byte Folded Spill
; CHECK: ;;#ASMSTART
; CHECK-NEXT: ;;#ASMEND
-; CHECK-DAG: buffer_load_dword v{{.*}} offset:48 ; 4-byte Folded Reload
-; CHECK-DAG: buffer_load_dword v{{.*}} offset:52 ; 4-byte Folded Reload
-; CHECK-DAG: buffer_load_dword v{{.*}} offset:56 ; 4-byte Folded Reload
-; CHECK-DAG: buffer_load_dword v{{.*}} offset:60 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:32 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:36 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:40 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:44 ; 4-byte Folded Reload
define void @spill_v4i32() {
entry:
@@ -129,16 +129,16 @@ entry:
}
; CHECK-LABEL: spill_v4f32:
-; CHECK-DAG: buffer_store_dword v{{.*}} offset:48 ; 4-byte Folded Spill
-; CHECK-DAG: buffer_store_dword v{{.*}} offset:52 ; 4-byte Folded Spill
-; CHECK-DAG: buffer_store_dword v{{.*}} offset:56 ; 4-byte Folded Spill
-; CHECK-DAG: buffer_store_dword v{{.*}} offset:60 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:32 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:36 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:40 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:44 ; 4-byte Folded Spill
; CHECK: ;;#ASMSTART
; CHECK-NEXT: ;;#ASMEND
-; CHECK-DAG: buffer_load_dword v{{.*}} offset:48 ; 4-byte Folded Reload
-; CHECK-DAG: buffer_load_dword v{{.*}} offset:52 ; 4-byte Folded Reload
-; CHECK-DAG: buffer_load_dword v{{.*}} offset:56 ; 4-byte Folded Reload
-; CHECK-DAG: buffer_load_dword v{{.*}} offset:60 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:32 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:36 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:40 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:44 ; 4-byte Folded Reload
define void @spill_v4f32() {
entry:
@@ -157,17 +157,16 @@ entry:
}
; CHECK-LABEL: spill_v5i32:
-; CHECK-DAG: buffer_store_dword v{{.*}} offset:96 ; 4-byte Folded Spill
-; CHECK-DAG: buffer_store_dword v{{.*}} offset:100 ; 4-byte Folded Spill
-; CHECK-DAG: buffer_store_dword v{{.*}} offset:104 ; 4-byte Folded Spill
-; CHECK-DAG: buffer_store_dword v{{.*}} offset:108 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:64 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:68 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:72 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:76 ; 4-byte Folded Spill
; CHECK: ;;#ASMSTART
; CHECK-NEXT: ;;#ASMEND
-; CHECK-DAG: buffer_load_dword v{{.*}} offset:96 ; 4-byte Folded Reload
-; CHECK-DAG: buffer_load_dword v{{.*}} offset:100 ; 4-byte Folded Reload
-; CHECK-DAG: buffer_load_dword v{{.*}} offset:104 ; 4-byte Folded Reload
-; CHECK-DAG: buffer_load_dword v{{.*}} offset:108 ; 4-byte Folded Reload
-
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:64 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:68 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:72 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:76 ; 4-byte Folded Reload
define void @spill_v5i32() {
entry:
%alloca = alloca <5 x i32>, i32 2, align 4, addrspace(5)
@@ -185,17 +184,16 @@ entry:
}
; CHECK-LABEL: spill_v5f32:
-; CHECK-DAG: buffer_store_dword v{{.*}} offset:96 ; 4-byte Folded Spill
-; CHECK-DAG: buffer_store_dword v{{.*}} offset:100 ; 4-byte Folded Spill
-; CHECK-DAG: buffer_store_dword v{{.*}} offset:104 ; 4-byte Folded Spill
-; CHECK-DAG: buffer_store_dword v{{.*}} offset:108 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:64 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:68 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:72 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:76 ; 4-byte Folded Spill
; CHECK: ;;#ASMSTART
; CHECK-NEXT: ;;#ASMEND
-; CHECK-DAG: buffer_load_dword v{{.*}} offset:96 ; 4-byte Folded Reload
-; CHECK-DAG: buffer_load_dword v{{.*}} offset:100 ; 4-byte Folded Reload
-; CHECK-DAG: buffer_load_dword v{{.*}} offset:104 ; 4-byte Folded Reload
-; CHECK-DAG: buffer_load_dword v{{.*}} offset:108 ; 4-byte Folded Reload
-
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:64 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:68 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:72 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:76 ; 4-byte Folded Reload
define void @spill_v5f32() {
entry:
%alloca = alloca <5 x i32>, i32 2, align 4, addrspace(5)
@@ -211,6 +209,3 @@ entry:
ret void
}
-
-
-
diff --git a/llvm/test/CodeGen/AMDGPU/nested-calls.ll b/llvm/test/CodeGen/AMDGPU/nested-calls.ll
index 66e6988fbe6..0a27e9ca122 100644
--- a/llvm/test/CodeGen/AMDGPU/nested-calls.ll
+++ b/llvm/test/CodeGen/AMDGPU/nested-calls.ll
@@ -13,7 +13,7 @@ declare void @external_void_func_i32(i32) #0
; GCN-DAG: s_add_u32 s32, s32, 0x400
; Spill CSR VGPR used for SGPR spilling
; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
-; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 offset:4
+; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
; GCN-DAG: v_writelane_b32 v32, s33, 0
@@ -26,7 +26,7 @@ declare void @external_void_func_i32(i32) #0
; GCN: v_readlane_b32 s34, v32, 1
; GCN: v_readlane_b32 s33, v32, 0
; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 offset:4
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
; GCN: s_sub_u32 s32, s32, 0x400
diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll
index 0892bb8549f..d2923010d31 100644
--- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll
@@ -19,7 +19,7 @@ define fastcc i32 @i32_fastcc_i32_i32(i32 %arg0, i32 %arg1) #1 {
; GCN-NEXT: v_mov_b32_e32 [[K:v[0-9]+]], 9
; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
-; GCN: buffer_store_dword [[K]], off, s[0:3], s32 offset:24
+; GCN: buffer_store_dword [[K]], off, s[0:3], s32 offset:20
; GCN: s_waitcnt vmcnt(0)
; GCN: s_setpc_b64
; GCN: ; ScratchSize: 68
@@ -40,7 +40,7 @@ entry:
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_stack_object:
; GCN: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
-; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:24
+; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:20
; GCN: s_setpc_b64
; GCN: ; ScratchSize: 68
define fastcc i32 @sibling_call_i32_fastcc_i32_i32_stack_object(i32 %a, i32 %b, i32 %c) #1 {
@@ -54,7 +54,7 @@ entry:
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_callee_stack_object:
; GCN: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
-; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:24
+; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:20
; GCN: s_setpc_b64
; GCN: ; ScratchSize: 136
define fastcc i32 @sibling_call_i32_fastcc_i32_i32_callee_stack_object(i32 %a, i32 %b, i32 %c) #1 {
@@ -84,7 +84,7 @@ entry:
; GCN-LABEL: {{^}}i32_fastcc_i32_byval_i32:
; GCN: s_waitcnt
-; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32{{$}}
; GCN-NEXT: s_waitcnt vmcnt(0)
; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1
@@ -100,7 +100,7 @@ define hidden fastcc i32 @i32_fastcc_i32_byval_i32(i32 %arg0, i32 addrspace(5)*
; Tail call disallowed with byval in parent.
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_byval_i32_byval_parent:
; GCN-NOT: v_writelane_b32 v{{[0-9]+}}, s32
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32{{$}}
; GCN: s_swappc_b64
; GCN-NOT: v_readlane_b32 s32
; GCN: s_setpc_b64
@@ -110,14 +110,17 @@ entry:
ret i32 %ret
}
-; Tail call disallowed with byval in parent, not callee.
+; Tail call disallowed with byval in parent, not callee. The stack
+; usage of incoming arguments must be <= the outgoing stack
+; arguments.
+
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_byval_i32:
; GCN-NOT: v0
; GCN-NOT: s32
; GCN: buffer_load_dword v1, off, s[0:3], s4 offset:16
-; GCN: buffer_store_dword v1, off, s[0:3], s32 offset:4
+; GCN: buffer_store_dword v1, off, s[0:3], s32{{$}}
; GCN-NEXT: s_setpc_b64
-define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32(i32 %a, [16 x i32] %large) #1 {
+define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32(i32 %a, [32 x i32] %large) #1 {
entry:
%ret = tail call fastcc i32 @i32_fastcc_i32_byval_i32(i32 %a, i32 addrspace(5)* inttoptr (i32 16 to i32 addrspace(5)*))
ret i32 %ret
@@ -125,8 +128,8 @@ entry:
; GCN-LABEL: {{^}}i32_fastcc_i32_i32_a32i32:
; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s32 offset:4
-; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s32 offset:8
+; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s32{{$}}
+; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s32 offset:4
; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1
; CIVI: v_add_{{i|u}}32_e32 v0, vcc, v0, [[LOAD_0]]
@@ -149,19 +152,19 @@ define fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %arg0, i32 %arg1, [32 x i32] %l
; FIXME: Why load and store same location for stack args?
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_a32i32:
-; GCN-DAG: buffer_store_dword v32, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GCN-DAG: buffer_store_dword v33, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GCN-DAG: buffer_store_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GCN-DAG: buffer_store_dword v33, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s32 offset:4
-; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s32 offset:8
+; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s32{{$}}
+; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s32 offset:4
; GCN-NOT: s32
-; GCN-DAG: buffer_store_dword [[LOAD_0]], off, s[0:3], s32 offset:4
-; GCN-DAG: buffer_store_dword [[LOAD_1]], off, s[0:3], s32 offset:8
+; GCN-DAG: buffer_store_dword [[LOAD_0]], off, s[0:3], s32{{$}}
+; GCN-DAG: buffer_store_dword [[LOAD_1]], off, s[0:3], s32 offset:4
-; GCN-DAG: buffer_load_dword v32, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GCN-DAG: buffer_load_dword v33, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GCN-DAG: buffer_load_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GCN-DAG: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GCN-NOT: s32
; GCN: s_setpc_b64
@@ -173,7 +176,7 @@ entry:
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_a32i32_stack_object:
; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
-; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:44
+; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:40
; GCN: s_setpc_b64
define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32_stack_object(i32 %a, i32 %b, [32 x i32] %c) #1 {
entry:
@@ -203,11 +206,11 @@ entry:
; GCN: s_add_u32 s32, s32, 0x400
; GCN: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
-; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s5 offset:12
+; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s5 offset:8
; GCN-NEXT: s_mov_b64 exec
-; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Spill
-; GCN: buffer_store_dword v33, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
+; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
+; GCN: buffer_store_dword v33, off, s[0:3], s5 ; 4-byte Folded Spill
; GCN-DAG: v_writelane_b32 v34, s33, 0
; GCN-DAG: v_writelane_b32 v34, s34, 1
@@ -221,10 +224,10 @@ entry:
; GCN-DAG: v_readlane_b32 s33, v34, 0
; GCN-DAG: v_readlane_b32 s34, v34, 1
-; GCN: buffer_load_dword v33, off, s[0:3], s5 offset:4
-; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:8
+; GCN: buffer_load_dword v33, off, s[0:3], s5 ; 4-byte Folded Reload
+; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
; GCN: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
-; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s5 offset:12
+; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s5 offset:8
; GCN-NEXT: s_mov_b64 exec
; GCN: s_sub_u32 s32, s32, 0x400
@@ -258,7 +261,7 @@ entry:
; GCN-LABEL: {{^}}sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area:
; GCN-NOT: s33
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:48
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:44
; GCN-NOT: s33
; GCN: s_setpc_b64 s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll b/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll
index 4288da70582..10c9f23df71 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll
@@ -4,13 +4,13 @@
; storeRegToStackSlot.
; GCN-LABEL: {{^}}spill_csr_s5_copy:
-; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Spill
+; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
; GCN: v_writelane_b32 v32, s5, 2
; GCN: s_swappc_b64
; GCN: v_readlane_b32 s5, v32, 2
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 9
-; GCN: buffer_store_dword [[K]], off, s[0:3], s5 offset:4
-; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Reload
+; GCN: buffer_store_dword [[K]], off, s[0:3], s5{{$}}
+; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
; GCN: s_setpc_b64
define void @spill_csr_s5_copy() #0 {
bb:
diff --git a/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll b/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll
index 972d4e6692b..90f41093a9f 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll
@@ -82,6 +82,55 @@ entry:
ret void
}
+; CHECK-LABEL: test_sgpr_offset_function_scavenge_fail
+define void @test_sgpr_offset_function_scavenge_fail() #2 {
+entry:
+ ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
+ ; fit in the instruction, and has to live in the SGPR offset.
+ %alloca = alloca i8, i32 4096, align 4, addrspace(5)
+ %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
+
+ %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
+
+ %asm.0 = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"()
+ %asm0.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 0
+ %asm1.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 1
+ %asm2.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 2
+ %asm3.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 3
+ %asm4.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 4
+ %asm5.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 5
+ %asm6.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 6
+ %asm7.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 7
+
+ ; 0x40000 / 64 = 4096 (for wave64)
+ %a = load volatile i32, i32 addrspace(5)* %aptr
+
+ ; CHECK: s_add_u32 s32, s32, 0x40000
+ ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s32 ; 4-byte Folded Spill
+ ; CHECK: s_sub_u32 s32, s32, 0x40000
+ call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0.0, i32 %asm1.0, i32 %asm2.0, i32 %asm3.0, i32 %asm4.0, i32 %asm5.0, i32 %asm6.0, i32 %asm7.0, i32 %a)
+
+ %asm = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"()
+ %asm0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 0
+ %asm1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 1
+ %asm2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 2
+ %asm3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 3
+ %asm4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 4
+ %asm5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 5
+ %asm6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 6
+ %asm7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 7
+
+ call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0
+
+ ; CHECK: s_add_u32 s32, s32, 0x40000
+ ; CHECK: buffer_load_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s32 ; 4-byte Folded Reload
+ ; CHECK: s_sub_u32 s32, s32, 0x40000
+
+ ; Force %a to spill with no free SGPRs
+ call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0, i32 %asm1, i32 %asm2, i32 %asm3, i32 %asm4, i32 %asm5, i32 %asm6, i32 %asm7, i32 %a)
+ ret void
+}
+
; CHECK-LABEL: test_sgpr_offset_subregs_kernel
define amdgpu_kernel void @test_sgpr_offset_subregs_kernel() {
entry:
@@ -145,7 +194,7 @@ define void @test_inst_offset_function() {
entry:
; Occupy 4092 bytes of scratch, so the offset of the spill of %a just fits in
; the instruction offset field.
- %alloca = alloca i8, i32 4088, align 4, addrspace(5)
+ %alloca = alloca i8, i32 4092, align 4, addrspace(5)
%buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
%aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
@@ -166,7 +215,7 @@ define void @test_sgpr_offset_function() {
entry:
; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
; fit in the instruction, and has to live in the SGPR offset.
- %alloca = alloca i8, i32 4092, align 4, addrspace(5)
+ %alloca = alloca i8, i32 4096, align 4, addrspace(5)
%buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
%aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
@@ -190,7 +239,7 @@ entry:
; Occupy 4088 bytes of scratch, so that the spill of the last subreg of %a
; still fits below offset 4096 (4088 + 8 - 4 = 4092), and can be placed in
; the instruction offset field.
- %alloca = alloca i8, i32 4084, align 4, addrspace(5)
+ %alloca = alloca i8, i32 4088, align 4, addrspace(5)
%bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
%bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*
@@ -218,7 +267,7 @@ entry:
; Occupy 4092 bytes of scratch, so that the spill of the last subreg of %a
; does not fit below offset 4096 (4092 + 8 - 4 = 4096), and has to live
; in the SGPR offset.
- %alloca = alloca i8, i32 4088, align 4, addrspace(5)
+ %alloca = alloca i8, i32 4092, align 4, addrspace(5)
%bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
%bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*
@@ -244,3 +293,4 @@ entry:
attributes #0 = { nounwind }
attributes #1 = { nounwind "amdgpu-num-sgpr"="18" "amdgpu-num-vgpr"="8" }
+attributes #2 = { nounwind "amdgpu-num-sgpr"="16" "amdgpu-num-vgpr"="8" }
diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
index 91453d73c9e..b84fde71ef5 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
@@ -10,8 +10,9 @@
; GCN-LABEL: {{^}}needs_align16_default_stack_align:
; GCN: s_sub_u32 [[SUB:s[0-9]+]], s32, s4
-; GCN-NEXT: v_lshrrev_b32_e64 [[FRAMEDIFF:v[0-9]+]], 6, [[SUB]]
-; GCN: v_add_u32_e64 [[FI:v[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 16, [[FRAMEDIFF]]
+; GCN-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, v0
+; GCN-DAG: v_lshrrev_b32_e64 [[FRAMEDIFF:v[0-9]+]], 6, [[SUB]]
+; GCN: v_add_u32_e32 [[FI:v[0-9]+]], vcc, [[FRAMEDIFF]], [[SCALED_IDX]]
; GCN-NOT: s32
@@ -126,10 +127,10 @@ define amdgpu_kernel void @kernel_call_align4_from_5() {
; GCN-LABEL: {{^}}default_realign_align128:
; GCN: s_add_u32 [[TMP:s[0-9]+]], s32, 0x1fc0
; GCN-NEXT: s_and_b32 s5, [[TMP]], 0xffffe000
-; GCN-NEXT: s_add_u32 s32, s32, 0x6000
+; GCN-NEXT: s_add_u32 s32, s32, 0x4000
; GCN-NOT: s5
-; GCN: buffer_store_dword v0, off, s[0:3], s5 offset:128
-; GCN: s_sub_u32 s32, s32, 0x6000
+; GCN: buffer_store_dword v0, off, s[0:3], s5{{$}}
+; GCN: s_sub_u32 s32, s32, 0x4000
define void @default_realign_align128(i32 %idx) #0 {
%alloca.align = alloca i32, align 128, addrspace(5)
store volatile i32 9, i32 addrspace(5)* %alloca.align, align 128
@@ -138,7 +139,7 @@ define void @default_realign_align128(i32 %idx) #0 {
; GCN-LABEL: {{^}}disable_realign_align128:
; GCN-NOT: s32
-; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:16
+; GCN: buffer_store_dword v0, off, s[0:3], s32{{$}}
; GCN-NOT: s32
define void @disable_realign_align128(i32 %idx) #3 {
%alloca.align = alloca i32, align 128, addrspace(5)
diff --git a/llvm/test/CodeGen/AMDGPU/store-hi16.ll b/llvm/test/CodeGen/AMDGPU/store-hi16.ll
index 2f9d0616fa6..395df1df7fd 100644
--- a/llvm/test/CodeGen/AMDGPU/store-hi16.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-hi16.ll
@@ -492,7 +492,7 @@ define void @store_private_hi_v2i16_max_offset(i16 addrspace(5)* byval %out, i32
entry:
%value = bitcast i32 %arg to <2 x i16>
%hi = extractelement <2 x i16> %value, i32 1
- %gep = getelementptr inbounds i16, i16 addrspace(5)* %out, i64 2045
+ %gep = getelementptr inbounds i16, i16 addrspace(5)* %out, i64 2047
store i16 %hi, i16 addrspace(5)* %gep
ret void
}
@@ -644,7 +644,7 @@ entry:
store volatile i32 123, i32 addrspace(5)* %bc
%value = bitcast i32 %arg to <2 x i16>
%hi = extractelement <2 x i16> %value, i32 1
- %gep = getelementptr inbounds [4096 x i16], [4096 x i16] addrspace(5)* %obj1, i32 0, i32 2025
+ %gep = getelementptr inbounds [4096 x i16], [4096 x i16] addrspace(5)* %obj1, i32 0, i32 2027
store i16 %hi, i16 addrspace(5)* %gep
ret void
}
@@ -661,7 +661,7 @@ entry:
store volatile i32 123, i32 addrspace(5)* %bc
%value = bitcast i32 %arg to <2 x i16>
%hi = extractelement <2 x i16> %value, i32 1
- %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4051
+ %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055
%trunc = trunc i16 %hi to i8
store i8 %trunc, i8 addrspace(5)* %gep
ret void
OpenPOWER on IntegriCloud