diff options
author | Yaxun Liu <Yaxun.Liu@amd.com> | 2018-04-25 01:10:37 +0000 |
---|---|---|
committer | Yaxun Liu <Yaxun.Liu@amd.com> | 2018-04-25 01:10:37 +0000 |
commit | 887c569bcb83115fce7ee768d92c93010fe49b47 (patch) | |
tree | b625a958bf24d8b4c60b58208fadace4316d4998 /clang/test/CodeGenCUDA | |
parent | 7282d320b7c9729cd91c5aa23d39629577e92c65 (diff) | |
download | bcm5719-llvm-887c569bcb83115fce7ee768d92c93010fe49b47.tar.gz bcm5719-llvm-887c569bcb83115fce7ee768d92c93010fe49b47.zip |
[HIP] Add hip input kind and codegen for kernel launching
HIP is a language similar to CUDA (https://github.com/ROCm-Developer-Tools/HIP/blob/master/docs/markdown/hip_kernel_language.md ).
The language syntax is very similar, which allows a hip program to be compiled as a CUDA program by Clang. The main difference
is the host API. HIP has a set of vendor-neutral host APIs which can be implemented on different platforms. Currently there is an open source
implementation of HIP runtime on amdgpu target (https://github.com/ROCm-Developer-Tools/HIP).
This patch adds support for the hip input kind and language standard.
When a hip file is compiled, both LangOpts.CUDA and LangOpts.HIP are turned on. This allows compilation of a hip program as CUDA
in most cases; LangOpts.HIP is checked only where special handling of a hip program is needed.
This patch also adds support for kernel launching in HIP programs using the HIP host API.
When -x hip is not specified, there is no behaviour change for CUDA.
Patch by Greg Rodgers.
Revised and lit test added by Yaxun Liu.
Differential Revision: https://reviews.llvm.org/D44984
llvm-svn: 330790
Diffstat (limited to 'clang/test/CodeGenCUDA')
-rw-r--r-- | clang/test/CodeGenCUDA/Inputs/cuda.h | 5 | ||||
-rw-r--r-- | clang/test/CodeGenCUDA/device-stub.cu | 101 | ||||
-rw-r--r-- | clang/test/CodeGenCUDA/kernel-call.cu | 13 |
3 files changed, 74 insertions, 45 deletions
diff --git a/clang/test/CodeGenCUDA/Inputs/cuda.h b/clang/test/CodeGenCUDA/Inputs/cuda.h index 9b9f43a1aaa..3adbdc5b6d1 100644 --- a/clang/test/CodeGenCUDA/Inputs/cuda.h +++ b/clang/test/CodeGenCUDA/Inputs/cuda.h @@ -16,7 +16,12 @@ struct dim3 { typedef struct cudaStream *cudaStream_t; +#ifdef __HIP__ +int hipConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0, + cudaStream_t stream = 0); +#else int cudaConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0, cudaStream_t stream = 0); +#endif extern "C" __device__ int printf(const char*, ...); diff --git a/clang/test/CodeGenCUDA/device-stub.cu b/clang/test/CodeGenCUDA/device-stub.cu index 58ae9c5d2ec..8339d872ad9 100644 --- a/clang/test/CodeGenCUDA/device-stub.cu +++ b/clang/test/CodeGenCUDA/device-stub.cu @@ -1,16 +1,28 @@ // RUN: echo "GPU binary would be here" > %t // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \ // RUN: -fcuda-include-gpubinary %t -o - \ -// RUN: | FileCheck %s --check-prefixes=ALL,NORDC +// RUN: | FileCheck %s --check-prefixes=ALL,NORDC,CUDA // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \ // RUN: -fcuda-include-gpubinary %t -o - -DNOGLOBALS \ // RUN: | FileCheck %s -check-prefix=NOGLOBALS // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \ // RUN: -fcuda-rdc -fcuda-include-gpubinary %t -o - \ -// RUN: | FileCheck %s --check-prefixes=ALL,RDC +// RUN: | FileCheck %s --check-prefixes=ALL,RDC,CUDA // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - \ // RUN: | FileCheck %s -check-prefix=NOGPUBIN +// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \ +// RUN: -fcuda-include-gpubinary %t -o - -x hip\ +// RUN: | FileCheck %s --check-prefixes=ALL,NORDC,HIP +// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \ +// RUN: -fcuda-include-gpubinary %t -o - -DNOGLOBALS -x hip \ +// RUN: | FileCheck %s -check-prefix=NOGLOBALS +// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \ +// RUN: -fcuda-rdc -fcuda-include-gpubinary 
%t -o - -x hip \ +// RUN: | FileCheck %s --check-prefixes=ALL,RDC,HIP +// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - -x hip\ +// RUN: | FileCheck %s -check-prefix=NOGPUBIN + #include "Inputs/cuda.h" #ifndef NOGLOBALS @@ -56,80 +68,83 @@ void use_pointers() { // NORDC-SAME: section ".nv_fatbin", align 8 // RDC-SAME: section "__nv_relfatbin", align 8 // * constant struct that wraps GPU binary -// ALL: @__cuda_fatbin_wrapper = internal constant { i32, i32, i8*, i8* } +// CUDA: @__[[PREFIX:cuda]]_fatbin_wrapper = internal constant +// CUDA-SAME: { i32, i32, i8*, i8* } +// HIP: @__[[PREFIX:hip]]_fatbin_wrapper = internal constant +// HIP-SAME: { i32, i32, i8*, i8* } // ALL-SAME: { i32 1180844977, i32 1, {{.*}}, i8* null } // ALL-SAME: section ".nvFatBinSegment" // * variable to save GPU binary handle after initialization -// NORDC: @__cuda_gpubin_handle = internal global i8** null +// NORDC: @__[[PREFIX]]_gpubin_handle = internal global i8** null // * constant unnamed string with NVModuleID // RDC: [[MODULE_ID_GLOBAL:@.*]] = private unnamed_addr constant // RDC-SAME: c"[[MODULE_ID:.+]]\00", section "__nv_module_id", align 32 // * Make sure our constructor was added to global ctor list. -// ALL: @llvm.global_ctors = appending global {{.*}}@__cuda_module_ctor +// ALL: @llvm.global_ctors = appending global {{.*}}@__[[PREFIX]]_module_ctor // * In separate mode we also register a destructor. -// NORDC: @llvm.global_dtors = appending global {{.*}}@__cuda_module_dtor +// NORDC: @llvm.global_dtors = appending global {{.*}}@__[[PREFIX]]_module_dtor // * Alias to global symbol containing the NVModuleID. // RDC: @__fatbinwrap[[MODULE_ID]] = alias { i32, i32, i8*, i8* } -// RDC-SAME: { i32, i32, i8*, i8* }* @__cuda_fatbin_wrapper +// RDC-SAME: { i32, i32, i8*, i8* }* @__[[PREFIX]]_fatbin_wrapper // Test that we build the correct number of calls to cudaSetupArgument followed // by a call to cudaLaunch. 
// ALL: define{{.*}}kernelfunc -// ALL: call{{.*}}cudaSetupArgument -// ALL: call{{.*}}cudaSetupArgument -// ALL: call{{.*}}cudaSetupArgument -// ALL: call{{.*}}cudaLaunch +// ALL: call{{.*}}[[PREFIX]]SetupArgument +// ALL: call{{.*}}[[PREFIX]]SetupArgument +// ALL: call{{.*}}[[PREFIX]]SetupArgument +// ALL: call{{.*}}[[PREFIX]]Launch __global__ void kernelfunc(int i, int j, int k) {} // Test that we've built correct kernel launch sequence. // ALL: define{{.*}}hostfunc -// ALL: call{{.*}}cudaConfigureCall +// ALL: call{{.*}}[[PREFIX]]ConfigureCall // ALL: call{{.*}}kernelfunc void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); } #endif // Test that we've built a function to register kernels and global vars. -// ALL: define internal void @__cuda_register_globals -// ALL: call{{.*}}cudaRegisterFunction(i8** %0, {{.*}}kernelfunc -// ALL-DAG: call{{.*}}cudaRegisterVar(i8** %0, {{.*}}device_var{{.*}}i32 0, i32 4, i32 0, i32 0 -// ALL-DAG: call{{.*}}cudaRegisterVar(i8** %0, {{.*}}constant_var{{.*}}i32 0, i32 4, i32 1, i32 0 -// ALL-DAG: call{{.*}}cudaRegisterVar(i8** %0, {{.*}}ext_device_var{{.*}}i32 1, i32 4, i32 0, i32 0 -// ALL-DAG: call{{.*}}cudaRegisterVar(i8** %0, {{.*}}ext_constant_var{{.*}}i32 1, i32 4, i32 1, i32 0 +// ALL: define internal void @__[[PREFIX]]_register_globals +// ALL: call{{.*}}[[PREFIX]]RegisterFunction(i8** %0, {{.*}}kernelfunc +// ALL-DAG: call{{.*}}[[PREFIX]]RegisterVar(i8** %0, {{.*}}device_var{{.*}}i32 0, i32 4, i32 0, i32 0 +// ALL-DAG: call{{.*}}[[PREFIX]]RegisterVar(i8** %0, {{.*}}constant_var{{.*}}i32 0, i32 4, i32 1, i32 0 +// ALL-DAG: call{{.*}}[[PREFIX]]RegisterVar(i8** %0, {{.*}}ext_device_var{{.*}}i32 1, i32 4, i32 0, i32 0 +// ALL-DAG: call{{.*}}[[PREFIX]]RegisterVar(i8** %0, {{.*}}ext_constant_var{{.*}}i32 1, i32 4, i32 1, i32 0 // ALL: ret void // Test that we've built a constructor. 
-// ALL: define internal void @__cuda_module_ctor - -// In separate mode it calls __cudaRegisterFatBinary(&__cuda_fatbin_wrapper) -// NORDC: call{{.*}}cudaRegisterFatBinary{{.*}}__cuda_fatbin_wrapper -// .. stores return value in __cuda_gpubin_handle -// NORDC-NEXT: store{{.*}}__cuda_gpubin_handle -// .. and then calls __cuda_register_globals -// NORDC-NEXT: call void @__cuda_register_globals - -// With relocatable device code we call __cudaRegisterLinkedBinary%NVModuleID% -// RDC: call{{.*}}__cudaRegisterLinkedBinary[[MODULE_ID]]( -// RDC-SAME: __cuda_register_globals, {{.*}}__cuda_fatbin_wrapper +// ALL: define internal void @__[[PREFIX]]_module_ctor + +// In separate mode it calls __[[PREFIX]]RegisterFatBinary(&__[[PREFIX]]_fatbin_wrapper) +// NORDC: call{{.*}}[[PREFIX]]RegisterFatBinary{{.*}}__[[PREFIX]]_fatbin_wrapper +// .. stores return value in __[[PREFIX]]_gpubin_handle +// NORDC-NEXT: store{{.*}}__[[PREFIX]]_gpubin_handle +// .. and then calls __[[PREFIX]]_register_globals +// NORDC-NEXT: call void @__[[PREFIX]]_register_globals + +// With relocatable device code we call __[[PREFIX]]RegisterLinkedBinary%NVModuleID% +// RDC: call{{.*}}__[[PREFIX]]RegisterLinkedBinary[[MODULE_ID]]( +// RDC-SAME: __[[PREFIX]]_register_globals, {{.*}}__[[PREFIX]]_fatbin_wrapper // RDC-SAME: [[MODULE_ID_GLOBAL]] // Test that we've created destructor. -// NORDC: define internal void @__cuda_module_dtor -// NORDC: load{{.*}}__cuda_gpubin_handle -// NORDC-NEXT: call void @__cudaUnregisterFatBinary +// NORDC: define internal void @__[[PREFIX]]_module_dtor +// NORDC: load{{.*}}__[[PREFIX]]_gpubin_handle +// NORDC-NEXT: call void @__[[PREFIX]]UnregisterFatBinary -// There should be no __cuda_register_globals if we have no +// There should be no __[[PREFIX]]_register_globals if we have no // device-side globals, but we still need to register GPU binary. // Skip GPU binary string first. 
// NOGLOBALS: @0 = private unnamed_addr constant{{.*}} -// NOGLOBALS-NOT: define internal void @__cuda_register_globals -// NOGLOBALS: define internal void @__cuda_module_ctor -// NOGLOBALS: call{{.*}}cudaRegisterFatBinary{{.*}}__cuda_fatbin_wrapper -// NOGLOBALS-NOT: call void @__cuda_register_globals -// NOGLOBALS: define internal void @__cuda_module_dtor -// NOGLOBALS: call void @__cudaUnregisterFatBinary +// NOGLOBALS-NOT: define internal void @__{{.*}}_register_globals +// NOGLOBALS: define internal void @__[[PREFIX:.*]]_module_ctor +// NOGLOBALS: call{{.*}}[[PREFIX]]RegisterFatBinary{{.*}}__[[PREFIX]]_fatbin_wrapper +// NOGLOBALS-NOT: call void @__[[PREFIX]]_register_globals +// NOGLOBALS: define internal void @__[[PREFIX]]_module_dtor +// NOGLOBALS: call void @__[[PREFIX]]UnregisterFatBinary // There should be no constructors/destructors if we have no GPU binary. -// NOGPUBIN-NOT: define internal void @__cuda_register_globals -// NOGPUBIN-NOT: define internal void @__cuda_module_ctor -// NOGPUBIN-NOT: define internal void @__cuda_module_dtor +// NOGPUBIN-NOT: define internal void @__[[PREFIX]]_register_globals +// NOGPUBIN-NOT: define internal void @__[[PREFIX]]_module_ctor +// NOGPUBIN-NOT: define internal void @__[[PREFIX]]_module_dtor diff --git a/clang/test/CodeGenCUDA/kernel-call.cu b/clang/test/CodeGenCUDA/kernel-call.cu index 9b849db908f..c6a4be44275 100644 --- a/clang/test/CodeGenCUDA/kernel-call.cu +++ b/clang/test/CodeGenCUDA/kernel-call.cu @@ -1,11 +1,20 @@ -// RUN: %clang_cc1 -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -emit-llvm %s -o - | FileCheck %s --check-prefixes=CUDA,CHECK +// RUN: %clang_cc1 -x hip -emit-llvm %s -o - | FileCheck %s --check-prefixes=HIP,CHECK + #include "Inputs/cuda.h" +// CHECK-LABEL: define void @_Z2g1i(i32 %x) +// HIP: call{{.*}}hipSetupArgument +// HIP: call{{.*}}hipLaunchByPtr +// CUDA: call{{.*}}cudaSetupArgument +// CUDA: call{{.*}}cudaLaunch __global__ void g1(int x) {} +// CHECK-LABEL: define i32 @main 
int main(void) { - // CHECK: call{{.*}}cudaConfigureCall + // HIP: call{{.*}}hipConfigureCall + // CUDA: call{{.*}}cudaConfigureCall // CHECK: icmp // CHECK: br // CHECK: call{{.*}}g1 |