diff options
| -rw-r--r-- | clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp | 45 | ||||
| -rw-r--r-- | clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp | 35 | ||||
| -rw-r--r-- | clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp | 14 |
3 files changed, 33 insertions, 61 deletions
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp index a7b3c16e1f0..b055132ef01 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp @@ -189,13 +189,6 @@ enum MachineConfiguration : unsigned { SharedMemorySize = 128, }; -enum NamedBarrier : unsigned { - /// Synchronize on this barrier #ID using a named barrier primitive. - /// Only the subset of active threads in a parallel region arrive at the - /// barrier. - NB_Parallel = 1, -}; - static const ValueDecl *getPrivateItem(const Expr *RefExpr) { RefExpr = RefExpr->IgnoreParens(); if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) { @@ -655,26 +648,9 @@ static void getNVPTXCTABarrier(CodeGenFunction &CGF) { CGF.EmitRuntimeCall(F); } -/// Get barrier #ID to synchronize selected (multiple of warp size) threads in -/// a CTA. -static void getNVPTXBarrier(CodeGenFunction &CGF, int ID, - llvm::Value *NumThreads) { - CGBuilderTy &Bld = CGF.Builder; - llvm::Value *Args[] = {Bld.getInt32(ID), NumThreads}; - llvm::Function *F = llvm::Intrinsic::getDeclaration( - &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier); - F->addFnAttr(llvm::Attribute::Convergent); - CGF.EmitRuntimeCall(F, Args); -} - /// Synchronize all GPU threads in a block. static void syncCTAThreads(CodeGenFunction &CGF) { getNVPTXCTABarrier(CGF); } -/// Synchronize worker threads in a parallel region. -static void syncParallelThreads(CodeGenFunction &CGF, llvm::Value *NumThreads) { - return getNVPTXBarrier(CGF, NB_Parallel, NumThreads); -} - /// Get the value of the thread_limit clause in the teams directive. /// For the 'generic' execution mode, the runtime encodes thread_limit in /// the launch parameters, always starting thread_limit+warpSize threads per @@ -3272,14 +3248,10 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM, CGF.EmitBlock(MergeBB); - Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg); - llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar( - AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, Loc); - - llvm::Value *NumActiveThreads = Bld.CreateNSWMul( - NumWarpsVal, getNVPTXWarpSize(CGF), "num_active_threads"); - // named_barrier_sync(ParallelBarrierID, num_active_threads) - syncParallelThreads(CGF, NumActiveThreads); + // kmpc_barrier. + CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown, + /*EmitChecks=*/false, + /*ForceSimpleCall=*/true); // // Warp 0 copies reduce element from transfer medium. @@ -3288,6 +3260,10 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM, llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else"); llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont"); + Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg); + llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar( + AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, Loc); + // Up to 32 threads in warp 0 are active. llvm::Value *IsActiveThread = Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread"); @@ -3329,7 +3305,10 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM, // While warp 0 copies values from transfer medium, all other warps must // wait. - syncParallelThreads(CGF, NumActiveThreads); + // kmpc_barrier. + CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown, + /*EmitChecks=*/false, + /*ForceSimpleCall=*/true); if (NumIters > 1) { Cnt = Bld.CreateNSWAdd(Cnt, llvm::ConstantInt::get(CGM.IntTy, /*V=*/1)); CGF.EmitStoreOfScalar(Cnt, CntAddr, /*Volatile=*/false, C.IntTy); diff --git a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp index 965ff9e4fbb..dd93b0c1b9e 100644 --- a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp @@ -209,9 +209,8 @@ int bar(int n){ // // Barrier after copy to shared memory storage medium. // CHECK: [[COPY_CONT]] - // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() - // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]] - // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // CHECK: call void @__kmpc_barrier(%struct.ident_t* @ + // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32* // // Read into warp 0. // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]] @@ -231,7 +230,7 @@ int bar(int n){ // CHECK: br label {{%?}}[[READ_CONT]] // // CHECK: [[READ_CONT]] - // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // CHECK: call void @__kmpc_barrier(%struct.ident_t* @ // CHECK: [[NEXT:%.+]] = add nsw i32 [[CNT]], 1 // CHECK: store i32 [[NEXT]], i32* [[CNT_ADDR]], // CHECK: br label @@ -446,9 +445,8 @@ int bar(int n){ // // Barrier after copy to shared memory storage medium. // CHECK: [[COPY_CONT]] - // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() - // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]] - // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // CHECK: call void @__kmpc_barrier(%struct.ident_t* @ + // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32* // // Read into warp 0. // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]] @@ -467,7 +465,7 @@ int bar(int n){ // CHECK: br label {{%?}}[[READ_CONT]] // // CHECK: [[READ_CONT]] - // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // CHECK: call void @__kmpc_barrier(%struct.ident_t* @ // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0 // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] // @@ -486,9 +484,8 @@ int bar(int n){ // // Barrier after copy to shared memory storage medium. // CHECK: [[COPY_CONT]] - // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() - // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]] - // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // CHECK: call void @__kmpc_barrier(%struct.ident_t* @ + // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32* // // Read into warp 0. // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]] @@ -507,7 +504,7 @@ int bar(int n){ // CHECK: br label {{%?}}[[READ_CONT]] // // CHECK: [[READ_CONT]] - // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // CHECK: call void @__kmpc_barrier(%struct.ident_t* @ // CHECK: ret @@ -761,9 +758,8 @@ int bar(int n){ // // Barrier after copy to shared memory storage medium. // CHECK: [[COPY_CONT]] - // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() - // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]] - // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // CHECK: call void @__kmpc_barrier(%struct.ident_t* @ + // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32* // // Read into warp 0. // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]] @@ -782,7 +778,7 @@ int bar(int n){ // CHECK: br label {{%?}}[[READ_CONT]] // // CHECK: [[READ_CONT]] - // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // CHECK: call void @__kmpc_barrier(%struct.ident_t* @ // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0 // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] // @@ -802,9 +798,8 @@ int bar(int n){ // // Barrier after copy to shared memory storage medium. // CHECK: [[COPY_CONT]] - // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() - // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]] - // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // CHECK: call void @__kmpc_barrier(%struct.ident_t* @ + // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32* // // Read into warp 0. // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]] @@ -824,7 +819,7 @@ int bar(int n){ // CHECK: br label {{%?}}[[READ_CONT]] // // CHECK: [[READ_CONT]] - // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // CHECK: call void @__kmpc_barrier(%struct.ident_t* @ // CHECK: ret #endif diff --git a/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp b/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp index 97a128866a4..65c147bd929 100644 --- a/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp +++ b/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp @@ -412,9 +412,8 @@ int bar(int n){ // // Barrier after copy to shared memory storage medium. // CHECK: [[COPY_CONT]] - // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() - // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]] - // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // CHECK: call void @__kmpc_barrier(%struct.ident_t* @ + // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32* // // Read into warp 0. // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]] @@ -433,7 +432,7 @@ int bar(int n){ // CHECK: br label {{%?}}[[READ_CONT]] // // CHECK: [[READ_CONT]] - // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // CHECK: call void @__kmpc_barrier(%struct.ident_t* @ // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0 // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] // @@ -453,9 +452,8 @@ int bar(int n){ // // Barrier after copy to shared memory storage medium. // CHECK: [[COPY_CONT]] - // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() - // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]] - // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // CHECK: call void @__kmpc_barrier(%struct.ident_t* @ + // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32* // // Read into warp 0. // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]] @@ -475,7 +473,7 @@ int bar(int n){ // CHECK: br label {{%?}}[[READ_CONT]] // // CHECK: [[READ_CONT]] - // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // CHECK: call void @__kmpc_barrier(%struct.ident_t* @ // CHECK: ret #endif |

