Diffstat (limited to 'clang/lib')
-rw-r--r--  clang/lib/CodeGen/CGCUDANV.cpp                    110
-rw-r--r--  clang/lib/Headers/__clang_cuda_runtime_wrapper.h   10
-rw-r--r--  clang/lib/Sema/SemaCUDA.cpp                        19
-rw-r--r--  clang/lib/Sema/SemaDecl.cpp                         7
4 files changed, 135 insertions, 11 deletions
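
For orientation, here is a rough host-side sketch of the launch sequence this patch implements. This is not the code Clang emits (the real stub is built as LLVM IR in CGCUDANV.cpp below); the example kernel, the stub name, and the __cudaPopCallConfiguration prototype are illustrative assumptions inferred from the calls visible in the diff.

#include <cuda_runtime.h>
#include <cstddef>

// Undocumented runtime entry point; prototype assumed to match the IR-level
// declaration that emitDeviceStubBodyNew builds below.
extern "C" cudaError_t __cudaPopCallConfiguration(dim3 *gridDim,
                                                  dim3 *blockDim,
                                                  size_t *sharedMem,
                                                  void *stream);

__global__ void kernel(int a, float *b);  // hypothetical example kernel

// Roughly what the generated host-side stub for `kernel` does.
void kernel_stub(int a, float *b) {
  void *args[] = {&a, &b};  // one pointer per formal argument
  dim3 grid, block;
  size_t shmem;
  cudaStream_t stream;
  // Retrieve the configuration pushed at the <<<...>>> call site.
  __cudaPopCallConfiguration(&grid, &block, &shmem, &stream);
  // The stub's own address serves as the kernel handle, matching the
  // CGF.CurFn cast in emitDeviceStubBodyNew.
  cudaLaunchKernel((const void *)kernel_stub, grid, block, args, shmem,
                   stream);
}
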
diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
index 0678f14e7db..9aaa5f76c7a 100644
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -15,6 +15,8 @@
#include "CodeGenFunction.h"
#include "CodeGenModule.h"
#include "clang/AST/Decl.h"
+#include "clang/Basic/Cuda.h"
+#include "clang/CodeGen/CodeGenABITypes.h"
#include "clang/CodeGen/ConstantInitBuilder.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
@@ -102,7 +104,8 @@ private:
return DummyFunc;
}
- void emitDeviceStubBody(CodeGenFunction &CGF, FunctionArgList &Args);
+ void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args);
+ void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args);
public:
CGNVCUDARuntime(CodeGenModule &CGM);
@@ -187,11 +190,110 @@ llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const {
void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
FunctionArgList &Args) {
EmittedKernels.push_back(CGF.CurFn);
- emitDeviceStubBody(CGF, Args);
+ if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
+ CudaFeature::CUDA_USES_NEW_LAUNCH))
+ emitDeviceStubBodyNew(CGF, Args);
+ else
+ emitDeviceStubBodyLegacy(CGF, Args);
}
-void CGNVCUDARuntime::emitDeviceStubBody(CodeGenFunction &CGF,
- FunctionArgList &Args) {
+// CUDA 9.0+ uses a new way to launch kernels. Parameters are packed in a local
+// array and kernels are launched using cudaLaunchKernel().
+void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
+ FunctionArgList &Args) {
+ // Build the kernel argument array at the very start of the function.
+
+ // Calculate the amount of space we will need for all arguments. If we have
+ // no args, allocate a single pointer so we still have a valid pointer to the
+ // argument array that we can pass to the runtime, even if it will be unused.
+ Address KernelArgs = CGF.CreateTempAlloca(
+ VoidPtrTy, CharUnits::fromQuantity(16), "kernel_args",
+ llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size())));
+ // Store pointers to the arguments in a locally allocated launch_args.
+ for (unsigned i = 0; i < Args.size(); ++i) {
+ llvm::Value *VarPtr = CGF.GetAddrOfLocalVar(Args[i]).getPointer();
+ llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, VoidPtrTy);
+ CGF.Builder.CreateDefaultAlignedStore(
+ VoidVarPtr, CGF.Builder.CreateConstGEP1_32(KernelArgs.getPointer(), i));
+ }
+
+ llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
+
+ // Lookup cudaLaunchKernel function.
+ // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
+ // void **args, size_t sharedMem,
+ // cudaStream_t stream);
+ TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
+ DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
+ IdentifierInfo &cudaLaunchKernelII =
+ CGM.getContext().Idents.get("cudaLaunchKernel");
+ FunctionDecl *cudaLaunchKernelFD = nullptr;
+ for (const auto &Result : DC->lookup(&cudaLaunchKernelII)) {
+ if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result))
+ cudaLaunchKernelFD = FD;
+ }
+
+ if (cudaLaunchKernelFD == nullptr) {
+ CGM.Error(CGF.CurFuncDecl->getLocation(),
+ "Can't find declaration for cudaLaunchKernel()");
+ return;
+ }
+ // Create temporary dim3 grid_dim, block_dim.
+ ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1);
+ QualType Dim3Ty = GridDimParam->getType();
+ Address GridDim =
+ CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim");
+ Address BlockDim =
+ CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim");
+ Address ShmemSize =
+ CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size");
+ Address Stream =
+ CGF.CreateTempAlloca(VoidPtrTy, CGM.getPointerAlign(), "stream");
+ llvm::Constant *cudaPopConfigFn = CGM.CreateRuntimeFunction(
+ llvm::FunctionType::get(IntTy,
+ {/*gridDim=*/GridDim.getType(),
+ /*blockDim=*/BlockDim.getType(),
+ /*ShmemSize=*/ShmemSize.getType(),
+ /*Stream=*/Stream.getType()},
+ /*isVarArg=*/false),
+ "__cudaPopCallConfiguration");
+
+ CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn,
+ {GridDim.getPointer(), BlockDim.getPointer(),
+ ShmemSize.getPointer(), Stream.getPointer()});
+
+ // Emit the call to cudaLaunchKernel.
+ llvm::Value *Kernel = CGF.Builder.CreatePointerCast(CGF.CurFn, VoidPtrTy);
+ CallArgList LaunchKernelArgs;
+ LaunchKernelArgs.add(RValue::get(Kernel),
+ cudaLaunchKernelFD->getParamDecl(0)->getType());
+ LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty);
+ LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty);
+ LaunchKernelArgs.add(RValue::get(KernelArgs.getPointer()),
+ cudaLaunchKernelFD->getParamDecl(3)->getType());
+ LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)),
+ cudaLaunchKernelFD->getParamDecl(4)->getType());
+ LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)),
+ cudaLaunchKernelFD->getParamDecl(5)->getType());
+
+ QualType QT = cudaLaunchKernelFD->getType();
+ QualType CQT = QT.getCanonicalType();
+ llvm::Type *Ty = CGM.getTypes().ConvertFunctionType(CQT, cudaLaunchKernelFD);
+ llvm::FunctionType *FTy = dyn_cast<llvm::FunctionType>(Ty);
+
+ const CGFunctionInfo &FI =
+ CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
+ llvm::Constant *cudaLaunchKernelFn =
+ CGM.CreateRuntimeFunction(FTy, "cudaLaunchKernel");
+ CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(),
+ LaunchKernelArgs);
+ CGF.EmitBranch(EndBlock);
+
+ CGF.EmitBlock(EndBlock);
+}
+
+void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
+ FunctionArgList &Args) {
// Emit a call to cudaSetupArgument for each arg in Args.
llvm::Constant *cudaSetupArgFn = getSetupArgumentFn();
llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
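
For contrast, a minimal sketch of what the legacy path (emitDeviceStubBodyLegacy, whose body continues past this excerpt) corresponds to at the source level, using the same hypothetical kernel as above; the deprecated cudaSetupArgument/cudaLaunch calls are the real pre-9.2 API, but the argument offsets shown are assumptions:

void kernel_stub_legacy(int a, float *b) {
  // Copy each argument into the runtime's launch buffer at its offset; a
  // failure branches straight to the end of the stub (the setup.end block).
  if (cudaSetupArgument(&a, sizeof(a), /*offset=*/0) != cudaSuccess)
    return;
  if (cudaSetupArgument(&b, sizeof(b), /*offset=*/8) != cudaSuccess)
    return;
  cudaLaunch((const void *)kernel_stub_legacy);
}
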
diff --git a/clang/lib/Headers/__clang_cuda_runtime_wrapper.h b/clang/lib/Headers/__clang_cuda_runtime_wrapper.h
index f05c0454a88..4b3672463e6 100644
--- a/clang/lib/Headers/__clang_cuda_runtime_wrapper.h
+++ b/clang/lib/Headers/__clang_cuda_runtime_wrapper.h
@@ -426,5 +426,15 @@ __device__ inline __cuda_builtin_gridDim_t::operator dim3() const {
#pragma pop_macro("__USE_FAST_MATH__")
#pragma pop_macro("__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__")
+// The CUDA runtime uses this undocumented function to access the kernel launch
+// configuration. The declaration is in crt/device_functions.h but that file
+// includes a lot of other stuff we don't want. Instead, we'll provide our own
+// declaration for it here.
+#if CUDA_VERSION >= 9020
+extern "C" unsigned __cudaPushCallConfiguration(dim3 gridDim, dim3 blockDim,
+ size_t sharedMem = 0,
+ void *stream = 0);
+#endif
+
#endif // __CUDA__
#endif // __CLANG_CUDA_RUNTIME_WRAPPER_H__
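
With this declaration in scope, here is a sketch of how a triple-angle-bracket launch plugs into the new configure function; gating the stub call on a zero return mirrors how Clang gates the kernel call on the configure call's result, and kernel_stub is the hypothetical stub from the sketch near the top:

// dim3 comes from the CUDA headers; __cudaPushCallConfiguration from the
// wrapper declaration above (when compiling as CUDA).
void kernel_stub(int a, float *b);  // hypothetical host stub, as above

void launch(dim3 grid, dim3 block, size_t shmem, void *stream,
            int a, float *b) {
  // What `kernel<<<grid, block, shmem, stream>>>(a, b);` roughly becomes:
  if (__cudaPushCallConfiguration(grid, block, shmem, stream) == 0)
    kernel_stub(a, b);  // the stub pops this configuration and launches
}
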
diff --git a/clang/lib/Sema/SemaCUDA.cpp b/clang/lib/Sema/SemaCUDA.cpp
index ec926eaf851..43cc14dfc97 100644
--- a/clang/lib/Sema/SemaCUDA.cpp
+++ b/clang/lib/Sema/SemaCUDA.cpp
@@ -13,6 +13,7 @@
#include "clang/AST/ASTContext.h"
#include "clang/AST/Decl.h"
#include "clang/AST/ExprCXX.h"
+#include "clang/Basic/Cuda.h"
#include "clang/Lex/Preprocessor.h"
#include "clang/Sema/Lookup.h"
#include "clang/Sema/Sema.h"
@@ -41,9 +42,8 @@ ExprResult Sema::ActOnCUDAExecConfigExpr(Scope *S, SourceLocation LLLLoc,
SourceLocation GGGLoc) {
FunctionDecl *ConfigDecl = Context.getcudaConfigureCallDecl();
if (!ConfigDecl)
- return ExprError(
- Diag(LLLLoc, diag::err_undeclared_var_use)
- << (getLangOpts().HIP ? "hipConfigureCall" : "cudaConfigureCall"));
+ return ExprError(Diag(LLLLoc, diag::err_undeclared_var_use)
+ << getCudaConfigureFuncName());
QualType ConfigQTy = ConfigDecl->getType();
DeclRefExpr *ConfigDR = new (Context)
@@ -957,3 +957,16 @@ void Sema::inheritCUDATargetAttrs(FunctionDecl *FD,
copyAttrIfPresent<CUDAHostAttr>(*this, FD, TemplateFD);
copyAttrIfPresent<CUDADeviceAttr>(*this, FD, TemplateFD);
}
+
+std::string Sema::getCudaConfigureFuncName() const {
+ if (getLangOpts().HIP)
+ return "hipConfigureCall";
+
+ // New CUDA kernel launch sequence.
+ if (CudaFeatureEnabled(Context.getTargetInfo().getSDKVersion(),
+ CudaFeature::CUDA_USES_NEW_LAUNCH))
+ return "__cudaPushCallConfiguration";
+
+ // Legacy CUDA kernel configuration call.
+ return "cudaConfigureCall";
+}
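
For reference, the three declarations Sema can end up matching, one per branch above; the first comes from the wrapper header in this patch, while the cudaConfigureCall and hipConfigureCall signatures are abbreviated assumptions about the SDK headers:

extern "C" unsigned __cudaPushCallConfiguration(dim3, dim3, size_t, void *); // CUDA >= 9.2
extern "C" cudaError_t cudaConfigureCall(dim3, dim3, size_t, cudaStream_t);  // legacy CUDA
extern "C" hipError_t hipConfigureCall(dim3, dim3, size_t, hipStream_t);     // HIP
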
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 38a73302e8c..112184d87d9 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -9146,13 +9146,12 @@ Sema::ActOnFunctionDeclarator(Scope *S, Declarator &D, DeclContext *DC,
if (getLangOpts().CUDA) {
IdentifierInfo *II = NewFD->getIdentifier();
- if (II &&
- II->isStr(getLangOpts().HIP ? "hipConfigureCall"
- : "cudaConfigureCall") &&
+ if (II && II->isStr(getCudaConfigureFuncName()) &&
!NewFD->isInvalidDecl() &&
NewFD->getDeclContext()->getRedeclContext()->isTranslationUnit()) {
if (!R->getAs<FunctionType>()->getReturnType()->isScalarType())
- Diag(NewFD->getLocation(), diag::err_config_scalar_return);
+ Diag(NewFD->getLocation(), diag::err_config_scalar_return)
+ << getCudaConfigureFuncName();
Context.setcudaConfigureCallDecl(NewFD);
}
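
Net effect of the Sema changes: the scalar-return check applies to whichever configure function is in use, and the diagnostic now names it. A hedged example of a declaration the check rejects under the new launch sequence (the exact err_config_scalar_return wording is outside this diff):

struct NotScalar { int x; };

// Matched as the configure function, but its return type is not scalar, so
// Sema emits err_config_scalar_return naming __cudaPushCallConfiguration.
extern "C" NotScalar __cudaPushCallConfiguration(dim3, dim3, size_t, void *);
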