-rw-r--r--  clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp        | 212
-rw-r--r--  clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h          |  11
-rw-r--r--  clang/test/OpenMP/nvptx_data_sharing.cpp          |  57
-rw-r--r--  clang/test/OpenMP/nvptx_parallel_codegen.cpp      |  22
-rw-r--r--  clang/test/OpenMP/nvptx_target_teams_codegen.cpp  |   4
5 files changed, 39 insertions(+), 267 deletions(-)
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
index 8ebfa3aa260..c0bf6e13b2a 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -33,11 +33,11 @@ enum OpenMPRTLFunctionNVPTX {
/// \brief Call to void __kmpc_spmd_kernel_deinit();
OMPRTL_NVPTX__kmpc_spmd_kernel_deinit,
/// \brief Call to void __kmpc_kernel_prepare_parallel(void
- /// *outlined_function, void ***args, kmp_int32 nArgs, int16_t
+ /// *outlined_function, int16_t
/// IsOMPRuntimeInitialized);
OMPRTL_NVPTX__kmpc_kernel_prepare_parallel,
- /// \brief Call to bool __kmpc_kernel_parallel(void **outlined_function, void
- /// ***args, int16_t IsOMPRuntimeInitialized);
+ /// \brief Call to bool __kmpc_kernel_parallel(void **outlined_function,
+ /// int16_t IsOMPRuntimeInitialized);
OMPRTL_NVPTX__kmpc_kernel_parallel,
/// \brief Call to void __kmpc_kernel_end_parallel();
OMPRTL_NVPTX__kmpc_kernel_end_parallel,
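
For reference, the two affected device entry points now have the following shape. This is a sketch reconstructed from the updated doc comments above, not a copy of the actual libomptarget NVPTX runtime headers:

    // Sketch only: C-level signatures inferred from the doc comments above.
    #include <cstdint>

    extern "C" void __kmpc_kernel_prepare_parallel(void *outlined_function,
                                                   int16_t IsOMPRuntimeInitialized);
    extern "C" bool __kmpc_kernel_parallel(void **outlined_function,
                                           int16_t IsOMPRuntimeInitialized);
    extern "C" void __kmpc_kernel_end_parallel();

The `void ***args` and `kmp_int32 nArgs` parameters are gone from both calls; the work function pointer alone is exchanged between master and workers.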
@@ -288,7 +288,6 @@ void CGOpenMPRuntimeNVPTX::emitGenericKernel(const OMPExecutableDirective &D,
EntryFunctionState EST;
WorkerFunctionState WST(CGM, D.getLocStart());
Work.clear();
- WrapperFunctionsMap.clear();
// Emit target region as a standalone region.
class NVPTXPrePostActionTy : public PrePostActionTy {
@@ -508,11 +507,8 @@ void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF,
CGF.InitTempAlloca(ExecStatus, Bld.getInt8(/*C=*/0));
CGF.InitTempAlloca(WorkFn, llvm::Constant::getNullValue(CGF.Int8PtrTy));
- // Set up shared arguments
- Address SharedArgs =
- CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrPtrTy, "shared_args");
// TODO: Optimize runtime initialization and pass in correct value.
- llvm::Value *Args[] = {WorkFn.getPointer(), SharedArgs.getPointer(),
+ llvm::Value *Args[] = {WorkFn.getPointer(),
/*RequiresOMPRuntime=*/Bld.getInt16(1)};
llvm::Value *Ret = CGF.EmitRuntimeCall(
createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_parallel), Args);
@@ -532,9 +528,6 @@ void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF,
// Signal start of parallel region.
CGF.EmitBlock(ExecuteBB);
- // Current context
- ASTContext &Ctx = CGF.getContext();
-
// Process work items: outlined parallel functions.
for (auto *W : Work) {
// Try to match this outlined function.
@@ -550,19 +543,14 @@ void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF,
// Execute this outlined function.
CGF.EmitBlock(ExecuteFNBB);
- // Insert call to work function via shared wrapper. The shared
- // wrapper takes exactly three arguments:
- // - the parallelism level;
- // - the master thread ID;
- // - the list of references to shared arguments.
- //
-      // TODO: Assert that the function is a wrapper function.
- Address Capture = CGF.EmitLoadOfPointer(SharedArgs,
- Ctx.getPointerType(
- Ctx.getPointerType(Ctx.VoidPtrTy)).castAs<PointerType>());
- emitOutlinedFunctionCall(CGF, WST.Loc, W,
- {Bld.getInt16(/*ParallelLevel=*/0),
- getMasterThreadID(CGF), Capture.getPointer()});
+ // Insert call to work function.
+ // FIXME: Pass arguments to outlined function from master thread.
+ auto *Fn = cast<llvm::Function>(W);
+ Address ZeroAddr =
+ CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty, /*Name=*/".zero.addr");
+ CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C=*/0));
+ llvm::Value *FnArgs[] = {ZeroAddr.getPointer(), ZeroAddr.getPointer()};
+ emitCall(CGF, WST.Loc, Fn, FnArgs);
// Go to end of parallel region.
CGF.EmitBranch(TerminateBB);
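
In C++-flavoured pseudo-code, the generic-mode worker loop after this hunk behaves roughly as follows; the barriers are elided, all names are illustrative, and the sketch assumes the outlined function takes the usual two i32* thread-id arguments:

    // Illustrative sketch of the emitted worker loop; not actual runtime code.
    #include <cstdint>

    extern "C" bool __kmpc_kernel_parallel(void **outlined_function,
                                           int16_t IsOMPRuntimeInitialized);
    extern "C" void __kmpc_kernel_end_parallel();

    void worker_loop() {
      void *work_fn = nullptr;
      for (;;) {
        // barrier0 with the master elided for brevity
        bool is_active = __kmpc_kernel_parallel(&work_fn,
                                                /*IsOMPRuntimeInitialized=*/1);
        if (!work_fn)
          return;                 // null work function: kernel is terminating
        if (is_active) {
          int32_t zero = 0;       // the ".zero.addr" temporary from the hunk
          auto *fn = reinterpret_cast<void (*)(int32_t *, int32_t *)>(work_fn);
          fn(&zero, &zero);       // FIXME in the patch: captured args not passed
          __kmpc_kernel_end_parallel();
        }
      }
    }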
@@ -630,10 +618,8 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
}
case OMPRTL_NVPTX__kmpc_kernel_prepare_parallel: {
/// Build void __kmpc_kernel_prepare_parallel(
- /// void *outlined_function, void ***args, kmp_int32 nArgs, int16_t
- /// IsOMPRuntimeInitialized);
+ /// void *outlined_function, int16_t IsOMPRuntimeInitialized);
llvm::Type *TypeParams[] = {CGM.Int8PtrTy,
- CGM.Int8PtrPtrTy->getPointerTo(0), CGM.Int32Ty,
CGM.Int16Ty};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
@@ -641,10 +627,9 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
break;
}
case OMPRTL_NVPTX__kmpc_kernel_parallel: {
- /// Build bool __kmpc_kernel_parallel(void **outlined_function, void
- /// ***args, int16_t IsOMPRuntimeInitialized);
- llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy,
- CGM.Int8PtrPtrTy->getPointerTo(0), CGM.Int16Ty};
+ /// Build bool __kmpc_kernel_parallel(void **outlined_function,
+ /// int16_t IsOMPRuntimeInitialized);
+ llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy, CGM.Int16Ty};
llvm::Type *RetTy = CGM.getTypes().ConvertType(CGM.getContext().BoolTy);
llvm::FunctionType *FnTy =
llvm::FunctionType::get(RetTy, TypeParams, /*isVarArg*/ false);
@@ -862,17 +847,8 @@ void CGOpenMPRuntimeNVPTX::emitNumTeamsClause(CodeGenFunction &CGF,
llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOutlinedFunction(
const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
-
- auto *OutlinedFun = cast<llvm::Function>(
- CGOpenMPRuntime::emitParallelOutlinedFunction(
- D, ThreadIDVar, InnermostKind, CodeGen));
- if (!isInSpmdExecutionMode()) {
- llvm::Function *WrapperFun =
- createDataSharingWrapper(OutlinedFun, D);
- WrapperFunctionsMap[OutlinedFun] = WrapperFun;
- }
-
- return OutlinedFun;
+ return CGOpenMPRuntime::emitParallelOutlinedFunction(D, ThreadIDVar,
+ InnermostKind, CodeGen);
}
llvm::Value *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction(
@@ -924,57 +900,16 @@ void CGOpenMPRuntimeNVPTX::emitGenericParallelCall(
CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
llvm::Function *Fn = cast<llvm::Function>(OutlinedFn);
- llvm::Function *WFn = WrapperFunctionsMap[Fn];
- assert(WFn && "Wrapper function does not exist!");
-
- // Force inline this outlined function at its call site.
- Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
- auto &&L0ParallelGen = [this, WFn, &CapturedVars](CodeGenFunction &CGF,
- PrePostActionTy &) {
+ auto &&L0ParallelGen = [this, Fn](CodeGenFunction &CGF, PrePostActionTy &) {
CGBuilderTy &Bld = CGF.Builder;
- llvm::Value *ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
-
- if (!CapturedVars.empty()) {
-      // There's something to share, so add the attribute.
- CGF.CurFn->addFnAttr("has-nvptx-shared-depot");
- // Prepare for parallel region. Indicate the outlined function.
- Address SharedArgs =
- CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy,
- "shared_args");
- llvm::Value *SharedArgsPtr = SharedArgs.getPointer();
- // TODO: Optimize runtime initialization and pass in correct value.
- llvm::Value *Args[] = {ID, SharedArgsPtr,
- Bld.getInt32(CapturedVars.size()),
- /*RequiresOMPRuntime=*/Bld.getInt16(1)};
-
- CGF.EmitRuntimeCall(
- createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_prepare_parallel),
- Args);
-
- unsigned Idx = 0;
- ASTContext &Ctx = CGF.getContext();
- for (llvm::Value *V : CapturedVars) {
- Address Dst = Bld.CreateConstInBoundsGEP(
- CGF.EmitLoadOfPointer(SharedArgs,
- Ctx.getPointerType(
- Ctx.getPointerType(Ctx.VoidPtrTy)).castAs<PointerType>()),
- Idx, CGF.getPointerSize());
- llvm::Value *PtrV = Bld.CreateBitCast(V, CGF.VoidPtrTy);
- CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
- Ctx.getPointerType(Ctx.VoidPtrTy));
- Idx++;
- }
- } else {
- // TODO: Optimize runtime initialization and pass in correct value.
- llvm::Value *Args[] = {
- ID, llvm::ConstantPointerNull::get(CGF.VoidPtrPtrTy->getPointerTo(0)),
- /*nArgs=*/Bld.getInt32(0), /*RequiresOMPRuntime=*/Bld.getInt16(1)};
- CGF.EmitRuntimeCall(
- createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_prepare_parallel),
- Args);
- }
+ // TODO: Optimize runtime initialization.
+ llvm::Value *Args[] = {Bld.CreateBitOrPointerCast(Fn, CGM.Int8PtrTy),
+ /*RequiresOMPRuntime=*/Bld.getInt16(1)};
+ CGF.EmitRuntimeCall(
+ createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_prepare_parallel),
+ Args);
// Activate workers. This barrier is used by the master to signal
// work for the workers.
@@ -989,7 +924,7 @@ void CGOpenMPRuntimeNVPTX::emitGenericParallelCall(
syncCTAThreads(CGF);
// Remember for post-processing in worker loop.
- Work.emplace_back(WFn);
+ Work.emplace_back(Fn);
};
auto *RTLoc = emitUpdateLocation(CGF, Loc);
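
The matching master-side emission is now just a prepare call plus the two barriers. Roughly, in the same illustrative pseudo-C++ (helper and symbol names are assumptions, not emitted code):

    // Sketch of what emitGenericParallelCall emits per generic-mode region.
    #include <cstdint>

    extern "C" void __kmpc_kernel_prepare_parallel(void *outlined_function,
                                                   int16_t IsOMPRuntimeInitialized);

    void master_launch(void (*outlined)(int32_t *, int32_t *)) {
      __kmpc_kernel_prepare_parallel(reinterpret_cast<void *>(outlined),
                                     /*IsOMPRuntimeInitialized=*/1);
      // barrier0: release the workers so they pick up 'outlined'
      // barrier0: wait for the workers to finish the region
      // The outlined function itself (no wrapper) is what the worker loop
      // compares against, hence Work.emplace_back(Fn) above.
    }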
@@ -2408,98 +2343,3 @@ void CGOpenMPRuntimeNVPTX::emitOutlinedFunctionCall(
}
CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
}
-
-/// Emit function which wraps the outlined parallel region
-/// and controls the arguments which are passed to this function.
-/// The wrapper ensures that the outlined function is called
-/// with the correct arguments when data is shared.
-llvm::Function *CGOpenMPRuntimeNVPTX::createDataSharingWrapper(
- llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
- ASTContext &Ctx = CGM.getContext();
- const CapturedStmt &CS = *D.getCapturedStmt(OMPD_parallel);
-
- // Create a function that takes as argument the source thread.
- FunctionArgList WrapperArgs;
- QualType Int16QTy =
- Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
- QualType Int32QTy =
- Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
- QualType Int32PtrQTy = Ctx.getPointerType(Int32QTy);
- QualType VoidPtrPtrQTy = Ctx.getPointerType(Ctx.VoidPtrTy);
- ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getLocStart(),
- /*Id=*/nullptr, Int16QTy,
- ImplicitParamDecl::Other);
- ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getLocStart(),
- /*Id=*/nullptr, Int32QTy,
- ImplicitParamDecl::Other);
- ImplicitParamDecl SharedArgsList(Ctx, /*DC=*/nullptr, D.getLocStart(),
- /*Id=*/nullptr, VoidPtrPtrQTy,
- ImplicitParamDecl::Other);
- WrapperArgs.emplace_back(&ParallelLevelArg);
- WrapperArgs.emplace_back(&WrapperArg);
- WrapperArgs.emplace_back(&SharedArgsList);
-
- auto &CGFI =
- CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);
-
- auto *Fn = llvm::Function::Create(
- CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
- OutlinedParallelFn->getName() + "_wrapper", &CGM.getModule());
- CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
- Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
-
- CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
- CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
- D.getLocStart(), D.getLocStart());
-
- const auto *RD = CS.getCapturedRecordDecl();
- auto CurField = RD->field_begin();
-
- // Get the array of arguments.
- SmallVector<llvm::Value *, 8> Args;
-
-  // TODO: support SIMD and pass actual values
- Args.emplace_back(llvm::ConstantPointerNull::get(
- CGM.Int32Ty->getPointerTo()));
- Args.emplace_back(llvm::ConstantPointerNull::get(
- CGM.Int32Ty->getPointerTo()));
-
- CGBuilderTy &Bld = CGF.Builder;
- auto CI = CS.capture_begin();
-
- // Load the start of the array
- auto SharedArgs =
- CGF.EmitLoadOfPointer(CGF.GetAddrOfLocalVar(&SharedArgsList),
- VoidPtrPtrQTy->castAs<PointerType>());
-
- // For each captured variable
- for (unsigned I = 0; I < CS.capture_size(); ++I, ++CI, ++CurField) {
- // Name of captured variable
- StringRef Name;
- if (CI->capturesThis())
- Name = "this";
- else
- Name = CI->getCapturedVar()->getName();
-
- // We retrieve the CLANG type of the argument. We use it to create
- // an alloca which will give us the LLVM type.
- QualType ElemTy = CurField->getType();
- // If this is a capture by copy the element type has to be the pointer to
- // the data.
- if (CI->capturesVariableByCopy())
- ElemTy = Ctx.getPointerType(ElemTy);
-
- // Get shared address of the captured variable.
- Address ArgAddress = Bld.CreateConstInBoundsGEP(
- SharedArgs, I, CGF.getPointerSize());
- Address TypedArgAddress = Bld.CreateBitCast(
- ArgAddress, CGF.ConvertTypeForMem(Ctx.getPointerType(ElemTy)));
- llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedArgAddress,
- /*Volatile=*/false, Int32PtrQTy, SourceLocation());
- Args.emplace_back(Arg);
- }
-
- emitOutlinedFunctionCall(CGF, D.getLocStart(), OutlinedParallelFn, Args);
- CGF.FinishFunction();
- return Fn;
-}
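
For context, the wrapper deleted above amounted to the following for a single captured int. This is a paraphrase of the removed codegen, with the symbol names taken from the deleted nvptx_data_sharing.cpp test below; none of it is emitted after this change:

    // Paraphrase of the removed data-sharing wrapper; all names illustrative.
    #include <cstdint>

    extern "C" void __omp_outlined__(int32_t *gtid, int32_t *btid, int32_t *a);

    extern "C" void __omp_outlined___wrapper(uint16_t parallel_level,
                                             uint32_t master_tid,
                                             void **shared_args) {
      // Each slot of shared_args holds the address of one captured variable.
      int32_t *a = reinterpret_cast<int32_t *>(shared_args[0]);
      // The thread-id arguments were passed as null pointers (see the
      // "TODO: support SIMD" block in the deleted code above).
      __omp_outlined__(/*gtid=*/nullptr, /*btid=*/nullptr, a);
    }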
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
index 6ee2b7afdfe..6723c3ac4bb 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
@@ -306,17 +306,6 @@ private:
// target region and used by containing directives such as 'parallel'
// to emit optimized code.
ExecutionMode CurrentExecutionMode;
-
- /// Map between an outlined function and its wrapper.
- llvm::DenseMap<llvm::Function *, llvm::Function *> WrapperFunctionsMap;
-
-  /// Emit function which wraps the outlined parallel region
- /// and controls the parameters which are passed to this function.
- /// The wrapper ensures that the outlined function is called
- /// with the correct arguments when data is shared.
- llvm::Function *
- createDataSharingWrapper(llvm::Function *OutlinedParallelFn,
- const OMPExecutableDirective &D);
};
} // CodeGen namespace.
diff --git a/clang/test/OpenMP/nvptx_data_sharing.cpp b/clang/test/OpenMP/nvptx_data_sharing.cpp
deleted file mode 100644
index 53bac3ced96..00000000000
--- a/clang/test/OpenMP/nvptx_data_sharing.cpp
+++ /dev/null
@@ -1,57 +0,0 @@
-// Test device data sharing codegen.
-///==========================================================================///
-
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CK1
-
-// expected-no-diagnostics
-
-#ifndef HEADER
-#define HEADER
-
-void test_ds(){
- #pragma omp target
- {
- int a = 10;
- #pragma omp parallel
- {
- a = 1000;
- }
- }
-}
-
-/// ========= In the worker function ========= ///
-
-// CK1: define internal void @__omp_offloading_{{.*}}test_ds{{.*}}worker() [[ATTR1:#.*]] {
-// CK1: [[SHAREDARGS:%.+]] = alloca i8**
-// CK1: call i1 @__kmpc_kernel_parallel(i8** %work_fn, i8*** [[SHAREDARGS]], i16 1)
-// CK1: [[SHARGSTMP:%.+]] = load i8**, i8*** [[SHAREDARGS]]
-// CK1: call void @__omp_outlined___wrapper{{.*}}({{.*}}, i8** [[SHARGSTMP]])
-
-/// ========= In the kernel function ========= ///
-
-// CK1: {{.*}}define void @__omp_offloading{{.*}}test_ds{{.*}}() [[ATTR2:#.*]] {
-// CK1: [[SHAREDARGS1:%.+]] = alloca i8**
-// CK1: call void @__kmpc_kernel_prepare_parallel({{.*}}, i8*** [[SHAREDARGS1]], i32 1, i16 1)
-// CK1: [[SHARGSTMP1:%.+]] = load i8**, i8*** [[SHAREDARGS1]]
-// CK1: [[SHARGSTMP2:%.+]] = getelementptr inbounds i8*, i8** [[SHARGSTMP1]]
-// CK1: [[SHAREDVAR:%.+]] = bitcast i32* {{.*}} to i8*
-// CK1: store i8* [[SHAREDVAR]], i8** [[SHARGSTMP2]]
-
-/// ========= In the data sharing wrapper function ========= ///
-
-// CK1: {{.*}}define internal void @__omp_outlined___wrapper({{.*}}i8**) [[ATTR1]] {
-// CK1: [[SHAREDARGS2:%.+]] = alloca i8**
-// CK1: store i8** %2, i8*** [[SHAREDARGS2]]
-// CK1: [[SHARGSTMP3:%.+]] = load i8**, i8*** [[SHAREDARGS2]]
-// CK1: [[SHARGSTMP4:%.+]] = getelementptr inbounds i8*, i8** [[SHARGSTMP3]]
-// CK1: [[SHARGSTMP5:%.+]] = bitcast i8** [[SHARGSTMP4]] to i32**
-// CK1: [[SHARGSTMP6:%.+]] = load i32*, i32** [[SHARGSTMP5]]
-// CK1: call void @__omp_outlined__({{.*}}, i32* [[SHARGSTMP6]])
-
-/// ========= Attributes ========= ///
-
-// CK1-NOT: attributes [[ATTR1]] = { {{.*}}"has-nvptx-shared-depot"{{.*}} }
-// CK1: attributes [[ATTR2]] = { {{.*}}"has-nvptx-shared-depot"{{.*}} }
-
-#endif
diff --git a/clang/test/OpenMP/nvptx_parallel_codegen.cpp b/clang/test/OpenMP/nvptx_parallel_codegen.cpp
index ba889bf4975..ea2773ce46a 100644
--- a/clang/test/OpenMP/nvptx_parallel_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_parallel_codegen.cpp
@@ -78,7 +78,7 @@ int bar(int n){
//
// CHECK: [[AWAIT_WORK]]
// CHECK: call void @llvm.nvvm.barrier0()
- // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]],
+ // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]]
// CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8
// store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1
// CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
@@ -92,20 +92,20 @@ int bar(int n){
//
// CHECK: [[EXEC_PARALLEL]]
// CHECK: [[WF1:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
- // CHECK: [[WM1:%.+]] = icmp eq i8* [[WF1]], bitcast (void (i16, i32, i8**)* [[PARALLEL_FN1:@.+]]_wrapper to i8*)
+ // CHECK: [[WM1:%.+]] = icmp eq i8* [[WF1]], bitcast (void (i32*, i32*)* [[PARALLEL_FN1:@.+]] to i8*)
// CHECK: br i1 [[WM1]], label {{%?}}[[EXEC_PFN1:.+]], label {{%?}}[[CHECK_NEXT1:.+]]
//
// CHECK: [[EXEC_PFN1]]
- // CHECK: call void [[PARALLEL_FN1]]_wrapper(
+ // CHECK: call void [[PARALLEL_FN1]](
// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
//
// CHECK: [[CHECK_NEXT1]]
// CHECK: [[WF2:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
- // CHECK: [[WM2:%.+]] = icmp eq i8* [[WF2]], bitcast (void (i16, i32, i8**)* [[PARALLEL_FN2:@.+]]_wrapper to i8*)
+ // CHECK: [[WM2:%.+]] = icmp eq i8* [[WF2]], bitcast (void (i32*, i32*)* [[PARALLEL_FN2:@.+]] to i8*)
// CHECK: br i1 [[WM2]], label {{%?}}[[EXEC_PFN2:.+]], label {{%?}}[[CHECK_NEXT2:.+]]
//
// CHECK: [[EXEC_PFN2]]
- // CHECK: call void [[PARALLEL_FN2]]_wrapper(
+ // CHECK: call void [[PARALLEL_FN2]](
// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
//
// CHECK: [[CHECK_NEXT2]]
@@ -152,13 +152,13 @@ int bar(int n){
// CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]]
// CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]]
- // CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32, i8**)* [[PARALLEL_FN1]]_wrapper to i8*),
+ // CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i32*, i32*)* [[PARALLEL_FN1]] to i8*),
// CHECK: call void @llvm.nvvm.barrier0()
// CHECK: call void @llvm.nvvm.barrier0()
// CHECK: call void @__kmpc_serialized_parallel(
// CHECK: {{call|invoke}} void [[PARALLEL_FN3:@.+]](
// CHECK: call void @__kmpc_end_serialized_parallel(
- // CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32, i8**)* [[PARALLEL_FN2]]_wrapper to i8*),
+ // CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i32*, i32*)* [[PARALLEL_FN2]] to i8*),
// CHECK: call void @llvm.nvvm.barrier0()
// CHECK: call void @llvm.nvvm.barrier0()
// CHECK-64-DAG: load i32, i32* [[REF_A]]
@@ -203,7 +203,7 @@ int bar(int n){
//
// CHECK: [[AWAIT_WORK]]
// CHECK: call void @llvm.nvvm.barrier0()
- // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]],
+ // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]]
// CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8
// store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1
// CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
@@ -217,11 +217,11 @@ int bar(int n){
//
// CHECK: [[EXEC_PARALLEL]]
// CHECK: [[WF:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
- // CHECK: [[WM:%.+]] = icmp eq i8* [[WF]], bitcast (void (i16, i32, i8**)* [[PARALLEL_FN4:@.+]]_wrapper to i8*)
+ // CHECK: [[WM:%.+]] = icmp eq i8* [[WF]], bitcast (void (i32*, i32*)* [[PARALLEL_FN4:@.+]] to i8*)
// CHECK: br i1 [[WM]], label {{%?}}[[EXEC_PFN:.+]], label {{%?}}[[CHECK_NEXT:.+]]
//
// CHECK: [[EXEC_PFN]]
- // CHECK: call void [[PARALLEL_FN4]]_wrapper(
+ // CHECK: call void [[PARALLEL_FN4]](
// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
//
// CHECK: [[CHECK_NEXT]]
@@ -283,7 +283,7 @@ int bar(int n){
// CHECK: br i1 [[CMP]], label {{%?}}[[IF_THEN:.+]], label {{%?}}[[IF_ELSE:.+]]
//
// CHECK: [[IF_THEN]]
- // CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32, i8**)* [[PARALLEL_FN4]]_wrapper to i8*),
+ // CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i32*, i32*)* [[PARALLEL_FN4]] to i8*),
// CHECK: call void @llvm.nvvm.barrier0()
// CHECK: call void @llvm.nvvm.barrier0()
// CHECK: br label {{%?}}[[IF_END:.+]]
diff --git a/clang/test/OpenMP/nvptx_target_teams_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_codegen.cpp
index ff5967b5612..d547c167144 100644
--- a/clang/test/OpenMP/nvptx_target_teams_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_teams_codegen.cpp
@@ -60,7 +60,7 @@ int bar(int n){
//
// CHECK: [[AWAIT_WORK]]
// CHECK: call void @llvm.nvvm.barrier0()
- // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]], i8*** %shared_args, i16 1)
+ // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]], i16 1)
// CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8
// store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1
// CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
@@ -146,7 +146,7 @@ int bar(int n){
//
// CHECK: [[AWAIT_WORK]]
// CHECK: call void @llvm.nvvm.barrier0()
- // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]], i8*** %shared_args, i16 1)
+ // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]], i16 1)
// CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8
// store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1
// CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]],