summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h 1
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp 12
-rw-r--r-- llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp 5
-rw-r--r-- llvm/test/CodeGen/AMDGPU/kernel-args.ll 53
-rw-r--r-- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll 48
5 files changed, 111 insertions, 8 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
index 6f50fca8831..bcc0e77a545 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -20,6 +20,7 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
/// local memory space.
SmallDenseMap<const GlobalValue *, unsigned, 4> LocalMemoryObjects;
+protected:
uint64_t KernArgSize;
unsigned MaxKernArgAlign;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index a52b1137203..97fc6493b95 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -414,12 +414,16 @@ bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {
unsigned SISubtarget::getKernArgSegmentSize(const Function &F,
unsigned ExplicitArgBytes) const {
+ uint64_t TotalSize = ExplicitArgBytes;
unsigned ImplicitBytes = getImplicitArgNumBytes(F);
- if (ImplicitBytes == 0)
- return ExplicitArgBytes;
- unsigned Alignment = getAlignmentForImplicitArgPtr();
- return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
+ if (ImplicitBytes != 0) {
+ unsigned Alignment = getAlignmentForImplicitArgPtr();
+ TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
+ }
+
+ // Being able to dereference past the end is useful for emitting scalar loads.
+ return alignTo(TotalSize, 4);
}
unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 495a8534865..61b6cb33fd1 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -71,8 +71,11 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
if (F.hasFnAttribute("amdgpu-implicitarg-ptr"))
ImplicitArgPtr = true;
} else {
- if (F.hasFnAttribute("amdgpu-implicitarg-ptr"))
+ if (F.hasFnAttribute("amdgpu-implicitarg-ptr")) {
KernargSegmentPtr = true;
+ assert(MaxKernArgAlign == 0);
+ MaxKernArgAlign = ST.getAlignmentForImplicitArgPtr();
+ }
}
CallingConv::ID CC = F.getCallingConv();
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
index 8e9abb9de8b..f51366f2665 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
@@ -5,6 +5,7 @@
; RUN: llc < %s -march=r600 -mcpu=cayman -verify-machineinstrs | FileCheck -enable-var-scope --check-prefix=EG --check-prefix=FUNC %s
; FUNC-LABEL: {{^}}i8_arg:
+; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4
; EG: AND_INT {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
@@ -25,6 +26,7 @@ entry:
}
; FUNC-LABEL: {{^}}i8_zext_arg:
+; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4
; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
@@ -44,6 +46,7 @@ entry:
}
; FUNC-LABEL: {{^}}i8_sext_arg:
+; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4
; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
@@ -63,7 +66,9 @@ entry:
}
; FUNC-LABEL: {{^}}i16_arg:
+; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4
+
; EG: AND_INT {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
; MESA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
@@ -83,7 +88,9 @@ entry:
}
; FUNC-LABEL: {{^}}i16_zext_arg:
+; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4
+
; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
@@ -102,7 +109,9 @@ entry:
}
; FUNC-LABEL: {{^}}i16_sext_arg:
+; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4
+
; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
@@ -121,7 +130,9 @@ entry:
}
; FUNC-LABEL: {{^}}i32_arg:
+; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4
+
; EG: T{{[0-9]\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
@@ -133,6 +144,7 @@ entry:
}
; FUNC-LABEL: {{^}}f32_arg:
+; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4
; EG: T{{[0-9]\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
@@ -145,7 +157,9 @@ entry:
}
; FUNC-LABEL: {{^}}v2i8_arg:
+; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4
+
; EG: VTX_READ_8
; EG: VTX_READ_8
; MESA-GCN: buffer_load_ubyte
@@ -159,7 +173,9 @@ entry:
}
; FUNC-LABEL: {{^}}v2i16_arg:
+; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4
+
; EG: VTX_READ_16
; EG: VTX_READ_16
@@ -174,7 +190,9 @@ entry:
}
; FUNC-LABEL: {{^}}v2i32_arg:
+; HSA-VI: kernarg_segment_byte_size = 16
; HSA-VI: kernarg_segment_alignment = 4
+
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W
; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
@@ -187,7 +205,9 @@ entry:
}
; FUNC-LABEL: {{^}}v2f32_arg:
+; HSA-VI: kernarg_segment_byte_size = 16
; HSA-VI: kernarg_segment_alignment = 4
+
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W
; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
@@ -200,7 +220,9 @@ entry:
}
; FUNC-LABEL: {{^}}v3i8_arg:
+; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4
+
; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 40
; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 41
; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 42
@@ -217,7 +239,9 @@ entry:
}
; FUNC-LABEL: {{^}}v3i16_arg:
+; HSA-VI: kernarg_segment_byte_size = 16
; HSA-VI: kernarg_segment_alignment = 4
+
; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 44
; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 46
; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 48
@@ -233,6 +257,7 @@ entry:
ret void
}
; FUNC-LABEL: {{^}}v3i32_arg:
+; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
@@ -247,6 +272,7 @@ entry:
}
; FUNC-LABEL: {{^}}v3f32_arg:
+; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
@@ -261,6 +287,7 @@ entry:
}
; FUNC-LABEL: {{^}}v4i8_arg:
+; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4
; EG: VTX_READ_8
; EG: VTX_READ_8
@@ -281,6 +308,7 @@ entry:
}
; FUNC-LABEL: {{^}}v4i16_arg:
+; HSA-VI: kernarg_segment_byte_size = 16
; HSA-VI: kernarg_segment_alignment = 4
; EG: VTX_READ_16
; EG: VTX_READ_16
@@ -301,6 +329,7 @@ entry:
}
; FUNC-LABEL: {{^}}v4i32_arg:
+; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
@@ -317,6 +346,7 @@ entry:
}
; FUNC-LABEL: {{^}}v4f32_arg:
+; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
@@ -332,6 +362,7 @@ entry:
}
; FUNC-LABEL: {{^}}v8i8_arg:
+; HSA-VI: kernarg_segment_byte_size = 16
; HSA-VI: kernarg_segment_alignment = 4
; EG: VTX_READ_8
; EG: VTX_READ_8
@@ -363,6 +394,7 @@ entry:
}
; FUNC-LABEL: {{^}}v8i16_arg:
+; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
; EG: VTX_READ_16
; EG: VTX_READ_16
@@ -393,6 +425,7 @@ entry:
}
; FUNC-LABEL: {{^}}v8i32_arg:
+; HSA-VI: kernarg_segment_byte_size = 64
; HSA-VI: kernarg_segment_alignment = 5
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
@@ -412,6 +445,7 @@ entry:
}
; FUNC-LABEL: {{^}}v8f32_arg:
+; HSA-VI: kernarg_segment_byte_size = 64
; HSA-VI: kernarg_segment_alignment = 5
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
@@ -429,6 +463,7 @@ entry:
}
; FUNC-LABEL: {{^}}v16i8_arg:
+; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
; EG: VTX_READ_8
; EG: VTX_READ_8
@@ -485,6 +520,7 @@ entry:
}
; FUNC-LABEL: {{^}}v16i16_arg:
+; HSA-VI: kernarg_segment_byte_size = 64
; HSA-VI: kernarg_segment_alignment = 5
; EG: VTX_READ_16
; EG: VTX_READ_16
@@ -535,6 +571,7 @@ entry:
}
; FUNC-LABEL: {{^}}v16i32_arg:
+; HSA-VI: kernarg_segment_byte_size = 128
; HSA-VI: kernarg_segment_alignment = 6
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
@@ -562,6 +599,7 @@ entry:
}
; FUNC-LABEL: {{^}}v16f32_arg:
+; HSA-VI: kernarg_segment_byte_size = 128
; HSA-VI: kernarg_segment_alignment = 6
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
@@ -621,6 +659,9 @@ entry:
; }
; FUNC-LABEL: {{^}}i1_arg:
+; HSA-VI: kernarg_segment_byte_size = 12
+; HSA-VI: kernarg_segment_alignment = 4
+
; SI: buffer_load_ubyte
; SI: v_and_b32_e32
; SI: buffer_store_byte
@@ -631,6 +672,9 @@ define amdgpu_kernel void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind {
}
; FUNC-LABEL: {{^}}i1_arg_zext_i32:
+; HSA-VI: kernarg_segment_byte_size = 12
+; HSA-VI: kernarg_segment_alignment = 4
+
; SI: buffer_load_ubyte
; SI: buffer_store_dword
; SI: s_endpgm
@@ -641,6 +685,9 @@ define amdgpu_kernel void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwi
}
; FUNC-LABEL: {{^}}i1_arg_zext_i64:
+; HSA-VI: kernarg_segment_byte_size = 12
+; HSA-VI: kernarg_segment_alignment = 4
+
; SI: buffer_load_ubyte
; SI: buffer_store_dwordx2
; SI: s_endpgm
@@ -651,6 +698,9 @@ define amdgpu_kernel void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwi
}
; FUNC-LABEL: {{^}}i1_arg_sext_i32:
+; HSA-VI: kernarg_segment_byte_size = 12
+; HSA-VI: kernarg_segment_alignment = 4
+
; SI: buffer_load_ubyte
; SI: buffer_store_dword
; SI: s_endpgm
@@ -661,6 +711,9 @@ define amdgpu_kernel void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwi
}
; FUNC-LABEL: {{^}}i1_arg_sext_i64:
+; HSA-VI: kernarg_segment_byte_size = 12
+; HSA-VI: kernarg_segment_alignment = 4
+
; SI: buffer_load_ubyte
; SI: v_bfe_i32
; SI: v_ashrrev_i32
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
index 772c155ea52..6c1bc9eaa76 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
@@ -4,6 +4,9 @@
; ALL-LABEL: {{^}}test:
; CO-V2: enable_sgpr_kernarg_segment_ptr = 1
+; HSA: kernarg_segment_byte_size = 8
+; HSA: kernarg_segment_alignment = 4
+
; CO-V2: s_load_dword s{{[0-9]+}}, s[4:5], 0xa
; OS-UNKNOWN: s_load_dword s{{[0-9]+}}, s[0:1], 0xa
@@ -17,6 +20,10 @@ define amdgpu_kernel void @test(i32 addrspace(1)* %out) #1 {
}
; ALL-LABEL: {{^}}test_implicit:
+; HSA: kernarg_segment_byte_size = 8
+; OS-MESA3D: kernarg_segment_byte_size = 24
+; CO-V2: kernarg_segment_alignment = 4
+
; 10 + 9 (36 prepended implicit bytes) + 2(out pointer) = 21 = 0x15
; OS-UNKNOWN: s_load_dword s{{[0-9]+}}, s[0:1], 0x15
define amdgpu_kernel void @test_implicit(i32 addrspace(1)* %out) #1 {
@@ -28,9 +35,12 @@ define amdgpu_kernel void @test_implicit(i32 addrspace(1)* %out) #1 {
ret void
}
-; ALL-LABEL: {{^}}test_implicit_alignment
-; HSA: kernarg_segment_byte_size = 10
+; ALL-LABEL: {{^}}test_implicit_alignment:
+; HSA: kernarg_segment_byte_size = 12
; OS-MESA3D: kernarg_segment_byte_size = 28
+; CO-V2: kernarg_segment_alignment = 4
+
+
; OS-UNKNOWN: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xc
; HSA: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x4
; OS-MESA3D: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x3
@@ -48,6 +58,9 @@ define amdgpu_kernel void @test_implicit_alignment(i32 addrspace(1)* %out, <2 x
; ALL-LABEL: {{^}}opencl_test_implicit_alignment
; HSA: kernarg_segment_byte_size = 64
; OS-MESA3D: kernarg_segment_byte_size = 28
+; CO-V2: kernarg_segment_alignment = 4
+
+
; OS-UNKNOWN: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xc
; HSA: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x4
; OS-MESA3D: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x3
@@ -63,7 +76,11 @@ define amdgpu_kernel void @opencl_test_implicit_alignment(i32 addrspace(1)* %out
}
; ALL-LABEL: {{^}}test_no_kernargs:
-; HSA: enable_sgpr_kernarg_segment_ptr = 1
+; CO-V2: enable_sgpr_kernarg_segment_ptr = 1
+; HSA: kernarg_segment_byte_size = 0
+; OS-MESA3D: kernarg_segment_byte_size = 16
+; CO-V2: kernarg_segment_alignment = 32
+
; HSA: s_load_dword s{{[0-9]+}}, s[4:5]
define amdgpu_kernel void @test_no_kernargs() #1 {
%kernarg.segment.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
@@ -74,9 +91,34 @@ define amdgpu_kernel void @test_no_kernargs() #1 {
ret void
}
+; GCN-LABEL: {{^}}opencl_test_implicit_alignment_no_explicit_kernargs:
+; HSA: kernarg_segment_byte_size = 48
+; OS-MESA3D: kernarg_segment_byte_size = 16
+; CO-V2: kernarg_segment_alignment = 4
+define amdgpu_kernel void @opencl_test_implicit_alignment_no_explicit_kernargs() #2 {
+ %implicitarg.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+ %arg.ptr = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
+ %val = load volatile i32, i32 addrspace(4)* %arg.ptr
+ store volatile i32 %val, i32 addrspace(1)* null
+ ret void
+}
+
+; GCN-LABEL: {{^}}opencl_test_implicit_alignment_no_explicit_kernargs_round_up:
+; HSA: kernarg_segment_byte_size = 40
+; OS-MESA3D: kernarg_segment_byte_size = 16
+; CO-V2: kernarg_segment_alignment = 4
+define amdgpu_kernel void @opencl_test_implicit_alignment_no_explicit_kernargs_round_up() #3 {
+ %implicitarg.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+ %arg.ptr = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
+ %val = load volatile i32, i32 addrspace(4)* %arg.ptr
+ store volatile i32 %val, i32 addrspace(1)* null
+ ret void
+}
+
declare i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0
declare i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #0
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
attributes #2 = { nounwind "amdgpu-implicitarg-num-bytes"="48" }
+attributes #3 = { nounwind "amdgpu-implicitarg-num-bytes"="38" }
OpenPOWER on IntegriCloud