diff options
| author | Matt Arsenault <Matthew.Arsenault@amd.com> | 2017-08-11 20:42:08 +0000 |
|---|---|---|
| committer | Matt Arsenault <Matthew.Arsenault@amd.com> | 2017-08-11 20:42:08 +0000 |
| commit | 71bcbd451ff7460f12c77b97ecdc29c7dbd19bf0 (patch) | |
| tree | 273dabb06f4e6d1ad833082b00f4703dde8b94b3 /llvm/test | |
| parent | 964e096345084b4ea1edba30c82f6f38d9c0884a (diff) | |
| download | bcm5719-llvm-71bcbd451ff7460f12c77b97ecdc29c7dbd19bf0.tar.gz bcm5719-llvm-71bcbd451ff7460f12c77b97ecdc29c7dbd19bf0.zip | |
AMDGPU: Start adding tail call support
Handle the sibling call cases.
llvm-svn: 310753
Diffstat (limited to 'llvm/test')
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/sibling-call.ll | 225 | ||||
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/tail-call-cgp.ll | 43 |
2 files changed, 268 insertions, 0 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll new file mode 100644 index 00000000000..08c6dc9dbc1 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll @@ -0,0 +1,225 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,MESA %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,MESA %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,VI,MESA %s + +; GCN-LABEL: {{^}}i32_fastcc_i32_i32: +; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GCN-NEXT: s_setpc_b64 +define fastcc i32 @i32_fastcc_i32_i32(i32 %arg0, i32 %arg1) #1 { + %add0 = add i32 %arg0, %arg1 + ret i32 %add0 +} + +; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32: +define fastcc i32 @sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %c) #1 { +entry: + %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b) + ret i32 %ret +} + +; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_stack_object: +; GCN: v_mov_b32_e32 [[NINE:v[0-9]+]], 9 +; GCN: buffer_store_dword [[NINE]], off, s[0:3], s5 offset:24 +; GCN: s_setpc_b64 +define fastcc i32 @sibling_call_i32_fastcc_i32_i32_stack_object(i32 %a, i32 %b, i32 %c) #1 { +entry: + %alloca = alloca [16 x i32], align 4 + %gep = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 5 + store volatile i32 9, i32* %gep + %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b) + ret i32 %ret +} + +; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_unused_result: +define fastcc void @sibling_call_i32_fastcc_i32_i32_unused_result(i32 %a, i32 %b, i32 %c) #1 { +entry: + %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 
%a, i32 %b) + ret void +} + +; It doesn't make sense to do a tail call from a kernel +; GCN-LABEL: {{^}}kernel_call_i32_fastcc_i32_i32_unused_result: +;define amdgpu_kernel void @kernel_call_i32_fastcc_i32_i32_unused_result(i32 %a, i32 %b, i32 %c) #1 { +define amdgpu_kernel void @kernel_call_i32_fastcc_i32_i32_unused_result(i32 %a, i32 %b, i32 %c) #1 { +entry: + %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b) + ret void +} + +; GCN-LABEL: {{^}}i32_fastcc_i32_byval_i32: +; GCN: s_waitcnt +; GCN-NEXT: s_mov_b32 s5, s32 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s5 offset:4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] +define fastcc i32 @i32_fastcc_i32_byval_i32(i32 %arg0, i32* byval align 4 %arg1) #1 { + %arg1.load = load i32, i32* %arg1, align 4 + %add0 = add i32 %arg0, %arg1.load + ret i32 %add0 +} + +; Tail call disallowed with byval in parent. +; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_byval_i32_byval_parent: +; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4 +; GCN: s_swappc_b64 +; GCN: s_setpc_b64 +define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32_byval_parent(i32 %a, i32* byval %b.byval, i32 %c) #1 { +entry: + %ret = tail call fastcc i32 @i32_fastcc_i32_byval_i32(i32 %a, i32* %b.byval) + ret i32 %ret +} + +; Tail call is disallowed with byval in the parent, but not with byval in the callee. 
+; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_byval_i32: +; GCN-NOT: v0 +; GCN-NOT: s32 +; GCN: buffer_load_dword v1, off, s[0:3], s4 offset:16 +; GCN: s_mov_b32 s5, s32 +; GCN: buffer_store_dword v1, off, s[0:3], s5 offset:4 +; GCN-NEXT: s_setpc_b64 +define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32(i32 %a, [16 x i32] %large) #1 { +entry: + %ret = tail call fastcc i32 @i32_fastcc_i32_byval_i32(i32 %a, i32* inttoptr (i32 16 to i32*)) + ret i32 %ret +} + +; GCN-LABEL: {{^}}i32_fastcc_i32_i32_a32i32: +; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s5 offset:4 +; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s5 offset:8 +; GCN-DAG: v_add_i32_e32 v0, vcc, v1, v0 +; GCN: v_add_i32_e32 v0, vcc, [[LOAD_0]], v0 +; GCN: v_add_i32_e32 v0, vcc, [[LOAD_1]], v0 +; GCN-NEXT: s_setpc_b64 +define fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %arg0, i32 %arg1, [32 x i32] %large) #1 { + %val_firststack = extractvalue [32 x i32] %large, 30 + %val_laststack = extractvalue [32 x i32] %large, 31 + %add0 = add i32 %arg0, %arg1 + %add1 = add i32 %add0, %val_firststack + %add2 = add i32 %add1, %val_laststack + ret i32 %add2 +} + +; FIXME: Why load and store same location for stack args? 
+; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_a32i32: +; GCN: s_mov_b32 s5, s32 + +; GCN-DAG: buffer_store_dword v32, off, s[0:3], s5 offset:16 ; 4-byte Folded Spill +; GCN-DAG: buffer_store_dword v33, off, s[0:3], s5 offset:12 ; 4-byte Folded Spill + +; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s5 offset:4 +; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s5 offset:8 + +; GCN-NOT: s32 + +; GCN-DAG: buffer_store_dword [[LOAD_0]], off, s[0:3], s5 offset:4 +; GCN-DAG: buffer_store_dword [[LOAD_1]], off, s[0:3], s5 offset:8 + +; GCN-DAG: buffer_load_dword v32, off, s[0:3], s5 offset:16 ; 4-byte Folded Reload +; GCN-DAG: buffer_load_dword v33, off, s[0:3], s5 offset:12 ; 4-byte Folded Reload + +; GCN-NOT: s32 +; GCN: s_setpc_b64 +define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 { +entry: + %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) + ret i32 %ret +} + +; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_a32i32_stack_object: +; GCN-DAG: s_mov_b32 s5, s32 +; GCN-NOT: s32 +; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9 +; GCN: buffer_store_dword [[NINE]], off, s[0:3], s5 offset:44 + +; GCN-NOT: s32 +; GCN: s_setpc_b64 +define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32_stack_object(i32 %a, i32 %b, [32 x i32] %c) #1 { +entry: + %alloca = alloca [16 x i32], align 4 + %gep = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 5 + store volatile i32 9, i32* %gep + %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) + ret i32 %ret +} + +; If the callee requires more stack argument space than the caller, +; don't do a tail call. +; TODO: Do we really need this restriction? 
+ +; GCN-LABEL: {{^}}no_sibling_call_callee_more_stack_space: +; GCN: s_swappc_b64 +; GCN: s_setpc_b64 +define fastcc i32 @no_sibling_call_callee_more_stack_space(i32 %a, i32 %b) #1 { +entry: + %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] zeroinitializer) + ret i32 %ret +} + +; Have another non-tail call in the function +; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_other_call: +; GCN: s_mov_b32 s5, s32 +; GCN: buffer_store_dword v34, off, s[0:3], s5 offset:12 +; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Spill +; GCN: buffer_store_dword v33, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill +; GCN-DAG: v_writelane_b32 v34, s33, 0 +; GCN-DAG: v_writelane_b32 v34, s34, 1 +; GCN-DAG: v_writelane_b32 v34, s35, 2 +; GCN-DAG: s_add_u32 s32, s32, 0x400 + +; GCN: s_getpc_b64 +; GCN: s_swappc_b64 + +; GCN: s_getpc_b64 s[6:7] +; GCN: s_add_u32 s6, s6, sibling_call_i32_fastcc_i32_i32@rel32@lo+4 +; GCN: s_addc_u32 s7, s7, sibling_call_i32_fastcc_i32_i32@rel32@hi+4 + +; GCN-DAG: v_readlane_b32 s33, v34, 0 +; GCN-DAG: v_readlane_b32 s34, v34, 1 +; GCN-DAG: v_readlane_b32 s35, v34, 2 + +; GCN: buffer_load_dword v33, off, s[0:3], s5 offset:4 +; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:8 +; GCN: buffer_load_dword v34, off, s[0:3], s5 offset:12 +; GCN: s_sub_u32 s32, s32, 0x400 +; GCN: s_setpc_b64 s[6:7] +define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i32 %c) #1 { +entry: + %other.call = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b) + %ret = tail call fastcc i32 @sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %other.call) + ret i32 %ret +} + +; Have stack object in caller and stack passed arguments. SP should be +; in same place at function exit. 
+ +; GCN-LABEL: {{^}}sibling_call_stack_objecti32_fastcc_i32_i32_a32i32: +; GCN: s_mov_b32 s5, s32 +; GCN-NOT: s32 +; GCN: s_setpc_b64 s[6:7] +define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 { +entry: + %alloca = alloca [16 x i32], align 4 + %gep = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 5 + store volatile i32 9, i32* %gep + %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) + ret i32 %ret +} + +; GCN-LABEL: {{^}}sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area: +; GCN: s_mov_b32 s5, s32 +; GCN-NOT: s32 +; GCN: s_setpc_b64 s[6:7] +define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area(i32 %a, i32 %b, [36 x i32] %c) #1 { +entry: + %alloca = alloca [16 x i32], align 4 + %gep = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 5 + store volatile i32 9, i32* %gep + %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] zeroinitializer) + ret i32 %ret +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind noinline } diff --git a/llvm/test/CodeGen/AMDGPU/tail-call-cgp.ll b/llvm/test/CodeGen/AMDGPU/tail-call-cgp.ll new file mode 100644 index 00000000000..6131ab2ae43 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/tail-call-cgp.ll @@ -0,0 +1,43 @@ +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -codegenprepare %s | FileCheck %s + +define internal fastcc void @callee(i32* nocapture %p, i32 %a) #0 { + store volatile i32 %a, i32* %p, align 4 + ret void +} + +; CHECK-LABEL: @func_caller( +; CHECK: tail call fastcc void @callee( +; CHECK-NEXT: ret void +; CHECK: ret void +define void @func_caller(i32* nocapture %p, i32 %a, i32 %b) #0 { +entry: + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %bb, label %ret + +bb: + tail call fastcc void @callee(i32* %p, i32 %a) + br label %ret + +ret: + ret void +} + +; CHECK-LABEL: @kernel_caller( +; CHECK: tail call fastcc void @callee( 
+; CHECK-NEXT: br label %ret + +; CHECK: ret void +define amdgpu_kernel void @kernel_caller(i32* nocapture %p, i32 %a, i32 %b) #0 { +entry: + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %bb, label %ret + +bb: + tail call fastcc void @callee(i32* %p, i32 %a) + br label %ret + +ret: + ret void +} + +attributes #0 = { nounwind } |

