summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp45
-rw-r--r--clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp35
-rw-r--r--clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp14
3 files changed, 33 insertions, 61 deletions
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
index a7b3c16e1f0..b055132ef01 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -189,13 +189,6 @@ enum MachineConfiguration : unsigned {
SharedMemorySize = 128,
};
-enum NamedBarrier : unsigned {
- /// Synchronize on this barrier #ID using a named barrier primitive.
- /// Only the subset of active threads in a parallel region arrive at the
- /// barrier.
- NB_Parallel = 1,
-};
-
static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
RefExpr = RefExpr->IgnoreParens();
if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) {
@@ -655,26 +648,9 @@ static void getNVPTXCTABarrier(CodeGenFunction &CGF) {
CGF.EmitRuntimeCall(F);
}
-/// Get barrier #ID to synchronize selected (multiple of warp size) threads in
-/// a CTA.
-static void getNVPTXBarrier(CodeGenFunction &CGF, int ID,
- llvm::Value *NumThreads) {
- CGBuilderTy &Bld = CGF.Builder;
- llvm::Value *Args[] = {Bld.getInt32(ID), NumThreads};
- llvm::Function *F = llvm::Intrinsic::getDeclaration(
- &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier);
- F->addFnAttr(llvm::Attribute::Convergent);
- CGF.EmitRuntimeCall(F, Args);
-}
-
/// Synchronize all GPU threads in a block.
static void syncCTAThreads(CodeGenFunction &CGF) { getNVPTXCTABarrier(CGF); }
-/// Synchronize worker threads in a parallel region.
-static void syncParallelThreads(CodeGenFunction &CGF, llvm::Value *NumThreads) {
- return getNVPTXBarrier(CGF, NB_Parallel, NumThreads);
-}
-
/// Get the value of the thread_limit clause in the teams directive.
/// For the 'generic' execution mode, the runtime encodes thread_limit in
/// the launch parameters, always starting thread_limit+warpSize threads per
@@ -3272,14 +3248,10 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
CGF.EmitBlock(MergeBB);
- Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
- llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
- AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, Loc);
-
- llvm::Value *NumActiveThreads = Bld.CreateNSWMul(
- NumWarpsVal, getNVPTXWarpSize(CGF), "num_active_threads");
- // named_barrier_sync(ParallelBarrierID, num_active_threads)
- syncParallelThreads(CGF, NumActiveThreads);
+ // kmpc_barrier.
+ CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
+ /*EmitChecks=*/false,
+ /*ForceSimpleCall=*/true);
//
// Warp 0 copies reduce element from transfer medium.
@@ -3288,6 +3260,10 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");
+ Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
+ llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
+ AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, Loc);
+
// Up to 32 threads in warp 0 are active.
llvm::Value *IsActiveThread =
Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
@@ -3329,7 +3305,10 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
// While warp 0 copies values from transfer medium, all other warps must
// wait.
- syncParallelThreads(CGF, NumActiveThreads);
+ // kmpc_barrier.
+ CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
+ /*EmitChecks=*/false,
+ /*ForceSimpleCall=*/true);
if (NumIters > 1) {
Cnt = Bld.CreateNSWAdd(Cnt, llvm::ConstantInt::get(CGM.IntTy, /*V=*/1));
CGF.EmitStoreOfScalar(Cnt, CntAddr, /*Volatile=*/false, C.IntTy);
diff --git a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp
index 965ff9e4fbb..dd93b0c1b9e 100644
--- a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp
@@ -209,9 +209,8 @@ int bar(int n){
//
// Barrier after copy to shared memory storage medium.
// CHECK: [[COPY_CONT]]
- // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
- // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]]
- // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
+ // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
+ // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
//
// Read into warp 0.
// CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
@@ -231,7 +230,7 @@ int bar(int n){
// CHECK: br label {{%?}}[[READ_CONT]]
//
// CHECK: [[READ_CONT]]
- // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
+ // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
// CHECK: [[NEXT:%.+]] = add nsw i32 [[CNT]], 1
// CHECK: store i32 [[NEXT]], i32* [[CNT_ADDR]],
// CHECK: br label
@@ -446,9 +445,8 @@ int bar(int n){
//
// Barrier after copy to shared memory storage medium.
// CHECK: [[COPY_CONT]]
- // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
- // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]]
- // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
+ // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
+ // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
//
// Read into warp 0.
// CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
@@ -467,7 +465,7 @@ int bar(int n){
// CHECK: br label {{%?}}[[READ_CONT]]
//
// CHECK: [[READ_CONT]]
- // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
+ // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
// CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
// CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
//
@@ -486,9 +484,8 @@ int bar(int n){
//
// Barrier after copy to shared memory storage medium.
// CHECK: [[COPY_CONT]]
- // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
- // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]]
- // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
+ // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
+ // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
//
// Read into warp 0.
// CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
@@ -507,7 +504,7 @@ int bar(int n){
// CHECK: br label {{%?}}[[READ_CONT]]
//
// CHECK: [[READ_CONT]]
- // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
+ // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
// CHECK: ret
@@ -761,9 +758,8 @@ int bar(int n){
//
// Barrier after copy to shared memory storage medium.
// CHECK: [[COPY_CONT]]
- // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
- // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]]
- // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
+ // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
+ // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
//
// Read into warp 0.
// CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
@@ -782,7 +778,7 @@ int bar(int n){
// CHECK: br label {{%?}}[[READ_CONT]]
//
// CHECK: [[READ_CONT]]
- // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
+ // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
// CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
// CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
//
@@ -802,9 +798,8 @@ int bar(int n){
//
// Barrier after copy to shared memory storage medium.
// CHECK: [[COPY_CONT]]
- // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
- // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]]
- // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
+ // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
+ // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
//
// Read into warp 0.
// CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
@@ -824,7 +819,7 @@ int bar(int n){
// CHECK: br label {{%?}}[[READ_CONT]]
//
// CHECK: [[READ_CONT]]
- // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
+ // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
// CHECK: ret
#endif
diff --git a/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp b/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp
index 97a128866a4..65c147bd929 100644
--- a/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp
@@ -412,9 +412,8 @@ int bar(int n){
//
// Barrier after copy to shared memory storage medium.
// CHECK: [[COPY_CONT]]
- // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
- // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]]
- // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
+ // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
+ // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
//
// Read into warp 0.
// CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
@@ -433,7 +432,7 @@ int bar(int n){
// CHECK: br label {{%?}}[[READ_CONT]]
//
// CHECK: [[READ_CONT]]
- // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
+ // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
// CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
// CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
//
@@ -453,9 +452,8 @@ int bar(int n){
//
// Barrier after copy to shared memory storage medium.
// CHECK: [[COPY_CONT]]
- // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
- // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]]
- // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
+ // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
+ // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
//
// Read into warp 0.
// CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
@@ -475,7 +473,7 @@ int bar(int n){
// CHECK: br label {{%?}}[[READ_CONT]]
//
// CHECK: [[READ_CONT]]
- // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
+ // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
// CHECK: ret
#endif
OpenPOWER on IntegriCloud