Diffstat (limited to 'clang/lib/CodeGen')
-rw-r--r--  clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp  407
-rw-r--r--  clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h      4
2 files changed, 246 insertions(+), 165 deletions(-)
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
index 8a984548927..da1c0d2a3a9 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -188,6 +188,28 @@ enum NamedBarrier : unsigned {
NB_Parallel = 1,
};
+static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
+ RefExpr = RefExpr->IgnoreParens();
+ if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) {
+ const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
+ while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
+ Base = TempASE->getBase()->IgnoreParenImpCasts();
+ RefExpr = Base;
+ } else if (auto *OASE = dyn_cast<OMPArraySectionExpr>(RefExpr)) {
+ const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
+ while (const auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Base))
+ Base = TempOASE->getBase()->IgnoreParenImpCasts();
+ while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
+ Base = TempASE->getBase()->IgnoreParenImpCasts();
+ RefExpr = Base;
+ }
+ RefExpr = RefExpr->IgnoreParenImpCasts();
+ if (const auto *DE = dyn_cast<DeclRefExpr>(RefExpr))
+ return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl());
+ const auto *ME = cast<MemberExpr>(RefExpr);
+ return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl());
+}
+
typedef std::pair<CharUnits /*Align*/, const ValueDecl *> VarsDataTy;
static bool stable_sort_comparator(const VarsDataTy P1, const VarsDataTy P2) {
return P1.first > P2.first;
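Note: the new getPrivateItem helper peels parentheses, array subscripts, and OpenMP array sections off a clause item until it reaches the underlying DeclRefExpr or MemberExpr, so items such as a[2] or a[0:n] all resolve to the canonical declaration of a. A rough source-level illustration (hypothetical example, not part of the commit):

    // Hypothetical OpenMP source; the array-section reduction item peels
    // down to the canonical declaration of 'a' via getPrivateItem:
    void example(int n, int *a) {
    #pragma omp target teams distribute parallel for reduction(+ : a[0:n])
      for (int i = 0; i < n; ++i)
        a[i % n] += i;
    }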
@@ -394,7 +416,10 @@ class CheckVarsEscapingDeclContext final
}
public:
- CheckVarsEscapingDeclContext(CodeGenFunction &CGF) : CGF(CGF) {}
+ CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
+ ArrayRef<const ValueDecl *> TeamsReductions)
+ : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) {
+ }
virtual ~CheckVarsEscapingDeclContext() = default;
void VisitDeclStmt(const DeclStmt *S) {
if (!S)
@@ -614,8 +639,10 @@ static llvm::Value *getNVPTXNumThreads(CodeGenFunction &CGF) {
/// Get barrier to synchronize all threads in a block.
static void getNVPTXCTABarrier(CodeGenFunction &CGF) {
- CGF.EmitRuntimeCall(llvm::Intrinsic::getDeclaration(
- &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier0));
+ llvm::Function *F = llvm::Intrinsic::getDeclaration(
+ &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier0);
+ F->addFnAttr(llvm::Attribute::Convergent);
+ CGF.EmitRuntimeCall(F);
}
/// Get barrier #ID to synchronize selected (multiple of warp size) threads in
@@ -624,9 +651,10 @@ static void getNVPTXBarrier(CodeGenFunction &CGF, int ID,
llvm::Value *NumThreads) {
CGBuilderTy &Bld = CGF.Builder;
llvm::Value *Args[] = {Bld.getInt32(ID), NumThreads};
- CGF.EmitRuntimeCall(llvm::Intrinsic::getDeclaration(
- &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier),
- Args);
+ llvm::Function *F = llvm::Intrinsic::getDeclaration(
+ &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier);
+ F->addFnAttr(llvm::Attribute::Convergent);
+ CGF.EmitRuntimeCall(F, Args);
}
/// Synchronize all GPU threads in a block.
@@ -1965,10 +1993,20 @@ getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D,
if (!Dir)
return;
for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
- for (const Expr *E : C->getVarRefs()) {
- const auto *DE = cast<DeclRefExpr>(E->IgnoreParens());
- Vars.push_back(cast<ValueDecl>(DE->getDecl()->getCanonicalDecl()));
- }
+ for (const Expr *E : C->getVarRefs())
+ Vars.push_back(getPrivateItem(E));
+ }
+}
+
+/// Get list of reduction variables from the teams ... directives.
+static void
+getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D,
+ llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
+ assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
+ "expected teams directive.");
+ for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
+ for (const Expr *E : C->privates())
+ Vars.push_back(getPrivateItem(E));
}
}
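Note: getTeamsReductionVars gathers the privatized reduction variables of a teams directive; in non-SPMD mode they are stashed in TeamAndReductions and later force-globalized in emitFunctionProlog so that all teams can combine their partial results. A hypothetical construct that exercises this path:

    // Hypothetical teams reduction; 'sum' is collected by
    // getTeamsReductionVars and globalized for the cross-team combine:
    double sum_all(const double *v, int n) {
      double sum = 0.0;
    #pragma omp target teams distribute parallel for map(to : v[0:n]) reduction(+ : sum)
      for (int i = 0; i < n; ++i)
        sum += v[i];
      return sum;
    }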
@@ -1978,13 +2016,22 @@ llvm::Value *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction(
SourceLocation Loc = D.getBeginLoc();
const RecordDecl *GlobalizedRD = nullptr;
- llvm::SmallVector<const ValueDecl *, 4> LastPrivates;
+ llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions;
llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
+ // Globalize team reductions variable unconditionally in all modes.
+ getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions);
if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD) {
- getDistributeLastprivateVars(CGM.getContext(), D, LastPrivates);
- if (!LastPrivates.empty())
+ getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions);
+ if (!LastPrivatesReductions.empty()) {
GlobalizedRD = ::buildRecordForGlobalizedVars(
- CGM.getContext(), llvm::None, LastPrivates, MappedDeclsFields);
+ CGM.getContext(), llvm::None, LastPrivatesReductions,
+ MappedDeclsFields);
+ }
+ } else if (!LastPrivatesReductions.empty()) {
+ assert(!TeamAndReductions.first &&
+ "Previous team declaration is not expected.");
+ TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl();
+ std::swap(TeamAndReductions.second, LastPrivatesReductions);
}
// Emit target region as a standalone region.
@@ -2162,7 +2209,7 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF,
/*Volatile=*/false, Int16Ty, Loc);
auto *StaticGlobalized = new llvm::GlobalVariable(
CGM.getModule(), CGM.Int8Ty, /*isConstant=*/false,
- llvm::GlobalValue::WeakAnyLinkage, nullptr);
+ llvm::GlobalValue::CommonLinkage, nullptr);
auto *RecSize = new llvm::GlobalVariable(
CGM.getModule(), CGM.SizeTy, /*isConstant=*/true,
llvm::GlobalValue::InternalLinkage, nullptr,
@@ -2801,11 +2848,12 @@ static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
CGF, CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc),
IntType, Offset, Loc);
CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType);
- Ptr = Bld.CreateConstGEP(Ptr, 1, CharUnits::fromQuantity(IntSize));
- ElemPtr =
+ Address LocalPtr =
+ Bld.CreateConstGEP(Ptr, 1, CharUnits::fromQuantity(IntSize));
+ Address LocalElemPtr =
Bld.CreateConstGEP(ElemPtr, 1, CharUnits::fromQuantity(IntSize));
- PhiSrc->addIncoming(Ptr.getPointer(), ThenBB);
- PhiDest->addIncoming(ElemPtr.getPointer(), ThenBB);
+ PhiSrc->addIncoming(LocalPtr.getPointer(), ThenBB);
+ PhiDest->addIncoming(LocalElemPtr.getPointer(), ThenBB);
CGF.EmitBranch(PreCondBB);
CGF.EmitBlock(ExitBB);
} else {
@@ -3228,10 +3276,9 @@ static llvm::Value *emitCopyToScratchpad(CodeGenModule &CGM,
CGF.SizeTy, /*isSigned=*/true);
Address AddrWidthArg = CGF.GetAddrOfLocalVar(&WidthArg);
- llvm::Value *WidthVal =
- Bld.CreateIntCast(CGF.EmitLoadOfScalar(AddrWidthArg, /*Volatile=*/false,
- Int32Ty, SourceLocation()),
- CGF.SizeTy, /*isSigned=*/true);
+ llvm::Value *WidthVal = Bld.CreateIntCast(
+ CGF.EmitLoadOfScalar(AddrWidthArg, /*Volatile=*/false, Int32Ty, Loc),
+ CGF.SizeTy, /*isSigned=*/true);
// The absolute ptr address to the base addr of the next element to copy.
llvm::Value *CumulativeElemBasePtr =
@@ -3305,11 +3352,10 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
llvm::GlobalVariable *TransferMedium =
M.getGlobalVariable(TransferMediumName);
if (!TransferMedium) {
- auto *Ty = llvm::ArrayType::get(CGM.Int64Ty, WarpSize);
+ auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize);
unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
TransferMedium = new llvm::GlobalVariable(
- M, Ty,
- /*isConstant=*/false, llvm::GlobalVariable::CommonLinkage,
+ M, Ty, /*isConstant=*/false, llvm::GlobalVariable::CommonLinkage,
llvm::Constant::getNullValue(Ty), TransferMediumName,
/*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
SharedAddressSpace);
@@ -3327,7 +3373,7 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
Address LocalReduceList(
Bld.CreatePointerBitCastOrAddrSpaceCast(
CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
- C.VoidPtrTy, SourceLocation()),
+ C.VoidPtrTy, Loc),
CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
CGF.getPointerAlign());
@@ -3337,121 +3383,150 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
// Warp master copies reduce element to transfer medium in __shared__
// memory.
//
- llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
- llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
- llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
-
- // if (lane_id == 0)
- llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master");
- Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
- CGF.EmitBlock(ThenBB);
-
- // Reduce element = LocalReduceList[i]
- Address ElemPtrPtrAddr =
- Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize());
- llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
- ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
- // elemptr = (type[i]*)(elemptrptr)
- Address ElemPtr =
- Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
- ElemPtr = Bld.CreateElementBitCast(
- ElemPtr, CGF.ConvertTypeForMem(Private->getType()));
-
- // Get pointer to location in transfer medium.
- // MediumPtr = &medium[warp_id]
- llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
- TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
- Address MediumPtr(MediumPtrVal, C.getTypeAlignInChars(Private->getType()));
- // Casting to actual data type.
- // MediumPtr = (type[i]*)MediumPtrAddr;
- MediumPtr = Bld.CreateElementBitCast(
- MediumPtr, CGF.ConvertTypeForMem(Private->getType()));
-
- // elem = *elemptr
- //*MediumPtr = elem
- if (Private->getType()->isScalarType()) {
- llvm::Value *Elem = CGF.EmitLoadOfScalar(ElemPtr, /*Volatile=*/false,
- Private->getType(), Loc);
- // Store the source element value to the dest element address.
- CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/false,
- Private->getType());
- } else {
- CGF.EmitAggregateCopy(CGF.MakeAddrLValue(ElemPtr, Private->getType()),
- CGF.MakeAddrLValue(MediumPtr, Private->getType()),
- Private->getType(), AggValueSlot::DoesNotOverlap);
- }
-
- Bld.CreateBr(MergeBB);
-
- CGF.EmitBlock(ElseBB);
- Bld.CreateBr(MergeBB);
+ unsigned RealTySize =
+ C.getTypeSizeInChars(Private->getType())
+ .alignTo(C.getTypeAlignInChars(Private->getType()))
+ .getQuantity();
+ for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /=2) {
+ unsigned NumIters = RealTySize / TySize;
+ if (NumIters == 0)
+ continue;
+ QualType CType = C.getIntTypeForBitwidth(
+ C.toBits(CharUnits::fromQuantity(TySize)), /*Signed=*/1);
+ llvm::Type *CopyType = CGF.ConvertTypeForMem(CType);
+ CharUnits Align = CharUnits::fromQuantity(TySize);
+ llvm::Value *Cnt = nullptr;
+ Address CntAddr = Address::invalid();
+ llvm::BasicBlock *PrecondBB = nullptr;
+ llvm::BasicBlock *ExitBB = nullptr;
+ if (NumIters > 1) {
+ CntAddr = CGF.CreateMemTemp(C.IntTy, ".cnt.addr");
+ CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.IntTy), CntAddr,
+ /*Volatile=*/false, C.IntTy);
+ PrecondBB = CGF.createBasicBlock("precond");
+ ExitBB = CGF.createBasicBlock("exit");
+ llvm::BasicBlock *BodyBB = CGF.createBasicBlock("body");
+ // There is no need to emit line number for unconditional branch.
+ (void)ApplyDebugLocation::CreateEmpty(CGF);
+ CGF.EmitBlock(PrecondBB);
+ Cnt = CGF.EmitLoadOfScalar(CntAddr, /*Volatile=*/false, C.IntTy, Loc);
+ llvm::Value *Cmp =
+ Bld.CreateICmpULT(Cnt, llvm::ConstantInt::get(CGM.IntTy, NumIters));
+ Bld.CreateCondBr(Cmp, BodyBB, ExitBB);
+ CGF.EmitBlock(BodyBB);
+ }
+ llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
+ llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
+ llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
- CGF.EmitBlock(MergeBB);
+ // if (lane_id == 0)
+ llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master");
+ Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
+ CGF.EmitBlock(ThenBB);
- Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
- llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
- AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, SourceLocation());
+ // Reduce element = LocalReduceList[i]
+ Address ElemPtrPtrAddr =
+ Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize());
+ llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
+ ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
+ // elemptr = ((CopyType*)(elemptrptr)) + I
+ Address ElemPtr = Address(ElemPtrPtr, Align);
+ ElemPtr = Bld.CreateElementBitCast(ElemPtr, CopyType);
+ if (NumIters > 1) {
+ ElemPtr = Address(Bld.CreateGEP(ElemPtr.getPointer(), Cnt),
+ ElemPtr.getAlignment());
+ }
- llvm::Value *NumActiveThreads = Bld.CreateNSWMul(
- NumWarpsVal, getNVPTXWarpSize(CGF), "num_active_threads");
- // named_barrier_sync(ParallelBarrierID, num_active_threads)
- syncParallelThreads(CGF, NumActiveThreads);
+ // Get pointer to location in transfer medium.
+ // MediumPtr = &medium[warp_id]
+ llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
+ TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
+ Address MediumPtr(MediumPtrVal, Align);
+ // Casting to actual data type.
+ // MediumPtr = (CopyType*)MediumPtrAddr;
+ MediumPtr = Bld.CreateElementBitCast(MediumPtr, CopyType);
+
+ // elem = *elemptr
+ //*MediumPtr = elem
+ llvm::Value *Elem =
+ CGF.EmitLoadOfScalar(ElemPtr, /*Volatile=*/false, CType, Loc);
+ // Store the source element value to the dest element address.
+ CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType);
+
+ Bld.CreateBr(MergeBB);
+
+ CGF.EmitBlock(ElseBB);
+ Bld.CreateBr(MergeBB);
+
+ CGF.EmitBlock(MergeBB);
+
+ Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
+ llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
+ AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, Loc);
+
+ llvm::Value *NumActiveThreads = Bld.CreateNSWMul(
+ NumWarpsVal, getNVPTXWarpSize(CGF), "num_active_threads");
+ // named_barrier_sync(ParallelBarrierID, num_active_threads)
+ syncParallelThreads(CGF, NumActiveThreads);
+
+ //
+ // Warp 0 copies reduce element from transfer medium.
+ //
+ llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
+ llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
+ llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");
+
+ // Up to 32 threads in warp 0 are active.
+ llvm::Value *IsActiveThread =
+ Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
+ Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
+
+ CGF.EmitBlock(W0ThenBB);
+
+ // SrcMediumPtr = &medium[tid]
+ llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
+ TransferMedium,
+ {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
+ Address SrcMediumPtr(SrcMediumPtrVal, Align);
+ // SrcMediumVal = *SrcMediumPtr;
+ SrcMediumPtr = Bld.CreateElementBitCast(SrcMediumPtr, CopyType);
+
+ // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
+ Address TargetElemPtrPtr =
+ Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize());
+ llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
+ TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, Loc);
+ Address TargetElemPtr = Address(TargetElemPtrVal, Align);
+ TargetElemPtr = Bld.CreateElementBitCast(TargetElemPtr, CopyType);
+ if (NumIters > 1) {
+ TargetElemPtr = Address(Bld.CreateGEP(TargetElemPtr.getPointer(), Cnt),
+ TargetElemPtr.getAlignment());
+ }
- //
- // Warp 0 copies reduce element from transfer medium.
- //
- llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
- llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
- llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");
-
- // Up to 32 threads in warp 0 are active.
- llvm::Value *IsActiveThread =
- Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
- Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
-
- CGF.EmitBlock(W0ThenBB);
-
- // SrcMediumPtr = &medium[tid]
- llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
- TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
- Address SrcMediumPtr(SrcMediumPtrVal,
- C.getTypeAlignInChars(Private->getType()));
- // SrcMediumVal = *SrcMediumPtr;
- SrcMediumPtr = Bld.CreateElementBitCast(
- SrcMediumPtr, CGF.ConvertTypeForMem(Private->getType()));
-
- // TargetElemPtr = (type[i]*)(SrcDataAddr[i])
- Address TargetElemPtrPtr =
- Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize());
- llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
- TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
- Address TargetElemPtr =
- Address(TargetElemPtrVal, C.getTypeAlignInChars(Private->getType()));
- TargetElemPtr = Bld.CreateElementBitCast(
- TargetElemPtr, CGF.ConvertTypeForMem(Private->getType()));
-
- // *TargetElemPtr = SrcMediumVal;
- if (Private->getType()->isScalarType()) {
- llvm::Value *SrcMediumValue = CGF.EmitLoadOfScalar(
- SrcMediumPtr, /*Volatile=*/false, Private->getType(), Loc);
+ // *TargetElemPtr = SrcMediumVal;
+ llvm::Value *SrcMediumValue =
+ CGF.EmitLoadOfScalar(SrcMediumPtr, /*Volatile=*/true, CType, Loc);
CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
- Private->getType());
- } else {
- CGF.EmitAggregateCopy(
- CGF.MakeAddrLValue(SrcMediumPtr, Private->getType()),
- CGF.MakeAddrLValue(TargetElemPtr, Private->getType()),
- Private->getType(), AggValueSlot::DoesNotOverlap);
- }
- Bld.CreateBr(W0MergeBB);
+ CType);
+ Bld.CreateBr(W0MergeBB);
- CGF.EmitBlock(W0ElseBB);
- Bld.CreateBr(W0MergeBB);
+ CGF.EmitBlock(W0ElseBB);
+ Bld.CreateBr(W0MergeBB);
- CGF.EmitBlock(W0MergeBB);
+ CGF.EmitBlock(W0MergeBB);
- // While warp 0 copies values from transfer medium, all other warps must
- // wait.
- syncParallelThreads(CGF, NumActiveThreads);
+ // While warp 0 copies values from transfer medium, all other warps must
+ // wait.
+ syncParallelThreads(CGF, NumActiveThreads);
+ if (NumIters > 1) {
+ Cnt = Bld.CreateNSWAdd(Cnt, llvm::ConstantInt::get(CGM.IntTy, /*V=*/1));
+ CGF.EmitStoreOfScalar(Cnt, CntAddr, /*Volatile=*/false, C.IntTy);
+ CGF.EmitBranch(PrecondBB);
+ (void)ApplyDebugLocation::CreateEmpty(CGF);
+ CGF.EmitBlock(ExitBB);
+ }
+ RealTySize %= TySize;
+ }
++Idx;
}
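Note: the rewritten inter-warp copy no longer assumes a scalar element. Each reduce element is moved through the (now 32-bit) shared transfer medium in chunks of 4, then 2, then 1 byte, looping when several chunks of one width are needed. A small sketch of the chunk schedule this loop produces (hypothetical helper, not part of the commit):

    #include <cstdio>

    // Mimics the TySize/RealTySize loop: for an element of 'size' bytes,
    // report how many 4-, 2- and 1-byte copies go through the medium.
    static void chunkSchedule(unsigned size) {
      for (unsigned tySize = 4; tySize > 0 && size > 0; tySize /= 2) {
        unsigned numIters = size / tySize;   // copies of this width
        if (numIters == 0)
          continue;
        std::printf("%u copy(ies) of %u byte(s)\n", numIters, tySize);
        size %= tySize;                      // remainder uses narrower chunks
      }
    }

    int main() {
      chunkSchedule(7);  // a 7-byte element: 1x4, then 1x2, then 1x1
    }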
@@ -3926,16 +4001,17 @@ void CGOpenMPRuntimeNVPTX::emitReduction(
bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
- bool SimdReduction = isOpenMPSimdDirective(Options.ReductionKind);
- assert((TeamsReduction || ParallelReduction || SimdReduction) &&
- "Invalid reduction selection in emitReduction.");
if (Options.SimpleReduction) {
+ assert(!TeamsReduction && !ParallelReduction &&
+ "Invalid reduction selection in emitReduction.");
CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
ReductionOps, Options);
return;
}
+ assert((TeamsReduction || ParallelReduction) &&
+ "Invalid reduction selection in emitReduction.");
ASTContext &C = CGM.getContext();
// 1. Build a list of reduction variables.
@@ -3993,24 +4069,20 @@ void CGOpenMPRuntimeNVPTX::emitReduction(
llvm::Value *InterWarpCopyFn =
emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);
- llvm::Value *Args[] = {ThreadId,
- CGF.Builder.getInt32(RHSExprs.size()),
- ReductionArrayTySize,
- RL,
- ShuffleAndReduceFn,
- InterWarpCopyFn};
+ llvm::Value *Res;
+ if (ParallelReduction) {
+ llvm::Value *Args[] = {ThreadId,
+ CGF.Builder.getInt32(RHSExprs.size()),
+ ReductionArrayTySize,
+ RL,
+ ShuffleAndReduceFn,
+ InterWarpCopyFn};
- llvm::Value *Res = nullptr;
- if (ParallelReduction)
Res = CGF.EmitRuntimeCall(
createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_reduce_nowait),
Args);
- else if (SimdReduction)
- Res = CGF.EmitRuntimeCall(
- createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_simd_reduce_nowait),
- Args);
-
- if (TeamsReduction) {
+ } else {
+ assert(TeamsReduction && "expected teams reduction.");
llvm::Value *ScratchPadCopyFn =
emitCopyToScratchpad(CGM, Privates, ReductionArrayTy, Loc);
llvm::Value *LoadAndReduceFn = emitReduceScratchpadFunction(
@@ -4029,18 +4101,18 @@ void CGOpenMPRuntimeNVPTX::emitReduction(
Args);
}
- // 5. Build switch(res)
- llvm::BasicBlock *DefaultBB = CGF.createBasicBlock(".omp.reduction.default");
- llvm::SwitchInst *SwInst =
- CGF.Builder.CreateSwitch(Res, DefaultBB, /*NumCases=*/1);
+ // 5. Build if (res == 1)
+ llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".omp.reduction.done");
+ llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".omp.reduction.then");
+ llvm::Value *Cond = CGF.Builder.CreateICmpEQ(
+ Res, llvm::ConstantInt::get(CGM.Int32Ty, /*V=*/1));
+ CGF.Builder.CreateCondBr(Cond, ThenBB, ExitBB);
- // 6. Build case 1: where we have reduced values in the master
+ // 6. Build then branch: where we have reduced values in the master
// thread in each team.
// __kmpc_end_reduce{_nowait}(<gtid>);
// break;
- llvm::BasicBlock *Case1BB = CGF.createBasicBlock(".omp.reduction.case1");
- SwInst->addCase(CGF.Builder.getInt32(1), Case1BB);
- CGF.EmitBlock(Case1BB);
+ CGF.EmitBlock(ThenBB);
// Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
llvm::Value *EndArgs[] = {ThreadId};
@@ -4064,8 +4136,9 @@ void CGOpenMPRuntimeNVPTX::emitReduction(
EndArgs);
RCG.setAction(Action);
RCG(CGF);
- CGF.EmitBranch(DefaultBB);
- CGF.EmitBlock(DefaultBB, /*IsFinished=*/true);
+ // There is no need to emit line number for unconditional branch.
+ (void)ApplyDebugLocation::CreateEmpty(CGF);
+ CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
}
const VarDecl *
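Note: with only parallel and teams reductions left on this path, the check of the runtime's result is emitted as a plain comparison instead of a one-case switch. Roughly, the generated code now has this shape (sketch with hypothetical stand-ins; the real entry points are resolved through createNVPTXRuntimeFunction):

    // Hypothetical stand-ins for the NVPTX reduction runtime entries.
    static int kmpc_reduce_nowait_stub() { return 1; }  // pretend this thread won
    static void kmpc_end_reduce_stub() {}

    void reductionEpilogue() {
      int res = kmpc_reduce_nowait_stub();
      if (res == 1) {   // was: switch (res) { case 1: ... }
        // ...combine partial results into the original list items...
        kmpc_end_reduce_stub();
      }
      // fall through to ".omp.reduction.done"
    }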
@@ -4292,6 +4365,8 @@ void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF,
assert(D && "Expected function or captured|block decl.");
assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
"Function is registered already.");
+ assert((!TeamAndReductions.first || TeamAndReductions.first == D) &&
+ "Team is set but not processed.");
const Stmt *Body = nullptr;
bool NeedToDelayGlobalization = false;
if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
@@ -4307,10 +4382,12 @@ void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF,
}
if (!Body)
return;
- CheckVarsEscapingDeclContext VarChecker(CGF);
+ CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
VarChecker.Visit(Body);
const RecordDecl *GlobalizedVarsRecord =
VarChecker.getGlobalizedRecord(IsInTTDRegion);
+ TeamAndReductions.first = nullptr;
+ TeamAndReductions.second.clear();
ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
VarChecker.getEscapedVariableLengthDecls();
if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty())
@@ -4331,7 +4408,7 @@ void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF,
Data.insert(std::make_pair(VD, MappedVarData(FD, IsInTTDRegion)));
}
if (!IsInTTDRegion && !NeedToDelayGlobalization && !IsInParallelRegion) {
- CheckVarsEscapingDeclContext VarChecker(CGF);
+ CheckVarsEscapingDeclContext VarChecker(CGF, llvm::None);
VarChecker.Visit(Body);
I->getSecond().SecondaryGlobalRecord =
VarChecker.getGlobalizedRecord(/*IsInTTDRegion=*/true);
@@ -4583,7 +4660,7 @@ void CGOpenMPRuntimeNVPTX::clear() {
llvm::Type *LLVMStaticTy = CGM.getTypes().ConvertTypeForMem(StaticTy);
auto *GV = new llvm::GlobalVariable(
CGM.getModule(), LLVMStaticTy,
- /*isConstant=*/false, llvm::GlobalValue::WeakAnyLinkage,
+ /*isConstant=*/false, llvm::GlobalValue::CommonLinkage,
llvm::Constant::getNullValue(LLVMStaticTy),
"_openmp_shared_static_glob_rd_$_", /*InsertBefore=*/nullptr,
llvm::GlobalValue::NotThreadLocal,
@@ -4609,7 +4686,7 @@ void CGOpenMPRuntimeNVPTX::clear() {
llvm::Type *LLVMArr2Ty = CGM.getTypes().ConvertTypeForMem(Arr2Ty);
auto *GV = new llvm::GlobalVariable(
CGM.getModule(), LLVMArr2Ty,
- /*isConstant=*/false, llvm::GlobalValue::WeakAnyLinkage,
+ /*isConstant=*/false, llvm::GlobalValue::CommonLinkage,
llvm::Constant::getNullValue(LLVMArr2Ty),
"_openmp_static_glob_rd_$_");
auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
index a157e421e06..aff9cf21135 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
@@ -431,6 +431,10 @@ private:
/// Shared pointer for the global memory in the global memory buffer used for
/// the given kernel.
llvm::GlobalVariable *KernelStaticGlobalized = nullptr;
+ /// Pair of the Non-SPMD team and all reductions variables in this team
+ /// region.
+ std::pair<const Decl *, llvm::SmallVector<const ValueDecl *, 4>>
+ TeamAndReductions;
};
} // CodeGen namespace.
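Note: the new TeamAndReductions member implements a simple stash-and-consume handshake: emitTeamsOutlinedFunction records the teams CapturedDecl together with its reduction variables, and emitFunctionProlog for that same decl seeds the escape analysis with them before clearing the pair. A minimal sketch of the pattern (hypothetical types, not the clang AST):

    #include <utility>
    #include <vector>

    struct Decl;       // hypothetical stand-ins for the clang AST types
    struct ValueDecl;

    // Producer: remember which teams region owns which reduction variables.
    static std::pair<const Decl *, std::vector<const ValueDecl *>> TeamAndReductions;

    void recordTeamsReductions(const Decl *Team, std::vector<const ValueDecl *> Vars) {
      TeamAndReductions = {Team, std::move(Vars)};
    }

    // Consumer: take the variables for that team, then reset the stash.
    std::vector<const ValueDecl *> takeTeamsReductions(const Decl *D) {
      std::vector<const ValueDecl *> Result;
      if (TeamAndReductions.first == D)
        Result = std::move(TeamAndReductions.second);
      TeamAndReductions.first = nullptr;
      TeamAndReductions.second.clear();
      return Result;
    }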