diff options
Diffstat (limited to 'clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp')
-rw-r--r-- | clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp | 407 |
1 files changed, 242 insertions, 165 deletions
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp index 8a984548927..da1c0d2a3a9 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp @@ -188,6 +188,28 @@ enum NamedBarrier : unsigned { NB_Parallel = 1, }; +static const ValueDecl *getPrivateItem(const Expr *RefExpr) { + RefExpr = RefExpr->IgnoreParens(); + if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) { + const Expr *Base = ASE->getBase()->IgnoreParenImpCasts(); + while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base)) + Base = TempASE->getBase()->IgnoreParenImpCasts(); + RefExpr = Base; + } else if (auto *OASE = dyn_cast<OMPArraySectionExpr>(RefExpr)) { + const Expr *Base = OASE->getBase()->IgnoreParenImpCasts(); + while (const auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Base)) + Base = TempOASE->getBase()->IgnoreParenImpCasts(); + while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base)) + Base = TempASE->getBase()->IgnoreParenImpCasts(); + RefExpr = Base; + } + RefExpr = RefExpr->IgnoreParenImpCasts(); + if (const auto *DE = dyn_cast<DeclRefExpr>(RefExpr)) + return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl()); + const auto *ME = cast<MemberExpr>(RefExpr); + return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl()); +} + typedef std::pair<CharUnits /*Align*/, const ValueDecl *> VarsDataTy; static bool stable_sort_comparator(const VarsDataTy P1, const VarsDataTy P2) { return P1.first > P2.first; @@ -394,7 +416,10 @@ class CheckVarsEscapingDeclContext final } public: - CheckVarsEscapingDeclContext(CodeGenFunction &CGF) : CGF(CGF) {} + CheckVarsEscapingDeclContext(CodeGenFunction &CGF, + ArrayRef<const ValueDecl *> TeamsReductions) + : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) { + } virtual ~CheckVarsEscapingDeclContext() = default; void VisitDeclStmt(const DeclStmt *S) { if (!S) @@ -614,8 +639,10 @@ static llvm::Value *getNVPTXNumThreads(CodeGenFunction &CGF) { /// Get barrier to synchronize all threads in a block. static void getNVPTXCTABarrier(CodeGenFunction &CGF) { - CGF.EmitRuntimeCall(llvm::Intrinsic::getDeclaration( - &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier0)); + llvm::Function *F = llvm::Intrinsic::getDeclaration( + &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier0); + F->addFnAttr(llvm::Attribute::Convergent); + CGF.EmitRuntimeCall(F); } /// Get barrier #ID to synchronize selected (multiple of warp size) threads in @@ -624,9 +651,10 @@ static void getNVPTXBarrier(CodeGenFunction &CGF, int ID, llvm::Value *NumThreads) { CGBuilderTy &Bld = CGF.Builder; llvm::Value *Args[] = {Bld.getInt32(ID), NumThreads}; - CGF.EmitRuntimeCall(llvm::Intrinsic::getDeclaration( - &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier), - Args); + llvm::Function *F = llvm::Intrinsic::getDeclaration( + &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier); + F->addFnAttr(llvm::Attribute::Convergent); + CGF.EmitRuntimeCall(F, Args); } /// Synchronize all GPU threads in a block. @@ -1965,10 +1993,20 @@ getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D, if (!Dir) return; for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) { - for (const Expr *E : C->getVarRefs()) { - const auto *DE = cast<DeclRefExpr>(E->IgnoreParens()); - Vars.push_back(cast<ValueDecl>(DE->getDecl()->getCanonicalDecl())); - } + for (const Expr *E : C->getVarRefs()) + Vars.push_back(getPrivateItem(E)); + } +} + +/// Get list of reduction variables from the teams ... directives. +static void +getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D, + llvm::SmallVectorImpl<const ValueDecl *> &Vars) { + assert(isOpenMPTeamsDirective(D.getDirectiveKind()) && + "expected teams directive."); + for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) { + for (const Expr *E : C->privates()) + Vars.push_back(getPrivateItem(E)); } } @@ -1978,13 +2016,22 @@ llvm::Value *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction( SourceLocation Loc = D.getBeginLoc(); const RecordDecl *GlobalizedRD = nullptr; - llvm::SmallVector<const ValueDecl *, 4> LastPrivates; + llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions; llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields; + // Globalize team reductions variable unconditionally in all modes. + getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions); if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD) { - getDistributeLastprivateVars(CGM.getContext(), D, LastPrivates); - if (!LastPrivates.empty()) + getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions); + if (!LastPrivatesReductions.empty()) { GlobalizedRD = ::buildRecordForGlobalizedVars( - CGM.getContext(), llvm::None, LastPrivates, MappedDeclsFields); + CGM.getContext(), llvm::None, LastPrivatesReductions, + MappedDeclsFields); + } + } else if (!LastPrivatesReductions.empty()) { + assert(!TeamAndReductions.first && + "Previous team declaration is not expected."); + TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl(); + std::swap(TeamAndReductions.second, LastPrivatesReductions); } // Emit target region as a standalone region. @@ -2162,7 +2209,7 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF, /*Volatile=*/false, Int16Ty, Loc); auto *StaticGlobalized = new llvm::GlobalVariable( CGM.getModule(), CGM.Int8Ty, /*isConstant=*/false, - llvm::GlobalValue::WeakAnyLinkage, nullptr); + llvm::GlobalValue::CommonLinkage, nullptr); auto *RecSize = new llvm::GlobalVariable( CGM.getModule(), CGM.SizeTy, /*isConstant=*/true, llvm::GlobalValue::InternalLinkage, nullptr, @@ -2801,11 +2848,12 @@ static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr, CGF, CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc), IntType, Offset, Loc); CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType); - Ptr = Bld.CreateConstGEP(Ptr, 1, CharUnits::fromQuantity(IntSize)); - ElemPtr = + Address LocalPtr = + Bld.CreateConstGEP(Ptr, 1, CharUnits::fromQuantity(IntSize)); + Address LocalElemPtr = Bld.CreateConstGEP(ElemPtr, 1, CharUnits::fromQuantity(IntSize)); - PhiSrc->addIncoming(Ptr.getPointer(), ThenBB); - PhiDest->addIncoming(ElemPtr.getPointer(), ThenBB); + PhiSrc->addIncoming(LocalPtr.getPointer(), ThenBB); + PhiDest->addIncoming(LocalElemPtr.getPointer(), ThenBB); CGF.EmitBranch(PreCondBB); CGF.EmitBlock(ExitBB); } else { @@ -3228,10 +3276,9 @@ static llvm::Value *emitCopyToScratchpad(CodeGenModule &CGM, CGF.SizeTy, /*isSigned=*/true); Address AddrWidthArg = CGF.GetAddrOfLocalVar(&WidthArg); - llvm::Value *WidthVal = - Bld.CreateIntCast(CGF.EmitLoadOfScalar(AddrWidthArg, /*Volatile=*/false, - Int32Ty, SourceLocation()), - CGF.SizeTy, /*isSigned=*/true); + llvm::Value *WidthVal = Bld.CreateIntCast( + CGF.EmitLoadOfScalar(AddrWidthArg, /*Volatile=*/false, Int32Ty, Loc), + CGF.SizeTy, /*isSigned=*/true); // The absolute ptr address to the base addr of the next element to copy. llvm::Value *CumulativeElemBasePtr = @@ -3305,11 +3352,10 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM, llvm::GlobalVariable *TransferMedium = M.getGlobalVariable(TransferMediumName); if (!TransferMedium) { - auto *Ty = llvm::ArrayType::get(CGM.Int64Ty, WarpSize); + auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize); unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared); TransferMedium = new llvm::GlobalVariable( - M, Ty, - /*isConstant=*/false, llvm::GlobalVariable::CommonLinkage, + M, Ty, /*isConstant=*/false, llvm::GlobalVariable::CommonLinkage, llvm::Constant::getNullValue(Ty), TransferMediumName, /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal, SharedAddressSpace); @@ -3327,7 +3373,7 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM, Address LocalReduceList( Bld.CreatePointerBitCastOrAddrSpaceCast( CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false, - C.VoidPtrTy, SourceLocation()), + C.VoidPtrTy, Loc), CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()), CGF.getPointerAlign()); @@ -3337,121 +3383,150 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM, // Warp master copies reduce element to transfer medium in __shared__ // memory. // - llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then"); - llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else"); - llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont"); - - // if (lane_id == 0) - llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master"); - Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB); - CGF.EmitBlock(ThenBB); - - // Reduce element = LocalReduceList[i] - Address ElemPtrPtrAddr = - Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize()); - llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar( - ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation()); - // elemptr = (type[i]*)(elemptrptr) - Address ElemPtr = - Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType())); - ElemPtr = Bld.CreateElementBitCast( - ElemPtr, CGF.ConvertTypeForMem(Private->getType())); - - // Get pointer to location in transfer medium. - // MediumPtr = &medium[warp_id] - llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP( - TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID}); - Address MediumPtr(MediumPtrVal, C.getTypeAlignInChars(Private->getType())); - // Casting to actual data type. - // MediumPtr = (type[i]*)MediumPtrAddr; - MediumPtr = Bld.CreateElementBitCast( - MediumPtr, CGF.ConvertTypeForMem(Private->getType())); - - // elem = *elemptr - //*MediumPtr = elem - if (Private->getType()->isScalarType()) { - llvm::Value *Elem = CGF.EmitLoadOfScalar(ElemPtr, /*Volatile=*/false, - Private->getType(), Loc); - // Store the source element value to the dest element address. - CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/false, - Private->getType()); - } else { - CGF.EmitAggregateCopy(CGF.MakeAddrLValue(ElemPtr, Private->getType()), - CGF.MakeAddrLValue(MediumPtr, Private->getType()), - Private->getType(), AggValueSlot::DoesNotOverlap); - } - - Bld.CreateBr(MergeBB); - - CGF.EmitBlock(ElseBB); - Bld.CreateBr(MergeBB); + unsigned RealTySize = + C.getTypeSizeInChars(Private->getType()) + .alignTo(C.getTypeAlignInChars(Private->getType())) + .getQuantity(); + for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /=2) { + unsigned NumIters = RealTySize / TySize; + if (NumIters == 0) + continue; + QualType CType = C.getIntTypeForBitwidth( + C.toBits(CharUnits::fromQuantity(TySize)), /*Signed=*/1); + llvm::Type *CopyType = CGF.ConvertTypeForMem(CType); + CharUnits Align = CharUnits::fromQuantity(TySize); + llvm::Value *Cnt = nullptr; + Address CntAddr = Address::invalid(); + llvm::BasicBlock *PrecondBB = nullptr; + llvm::BasicBlock *ExitBB = nullptr; + if (NumIters > 1) { + CntAddr = CGF.CreateMemTemp(C.IntTy, ".cnt.addr"); + CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.IntTy), CntAddr, + /*Volatile=*/false, C.IntTy); + PrecondBB = CGF.createBasicBlock("precond"); + ExitBB = CGF.createBasicBlock("exit"); + llvm::BasicBlock *BodyBB = CGF.createBasicBlock("body"); + // There is no need to emit line number for unconditional branch. + (void)ApplyDebugLocation::CreateEmpty(CGF); + CGF.EmitBlock(PrecondBB); + Cnt = CGF.EmitLoadOfScalar(CntAddr, /*Volatile=*/false, C.IntTy, Loc); + llvm::Value *Cmp = + Bld.CreateICmpULT(Cnt, llvm::ConstantInt::get(CGM.IntTy, NumIters)); + Bld.CreateCondBr(Cmp, BodyBB, ExitBB); + CGF.EmitBlock(BodyBB); + } + llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then"); + llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else"); + llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont"); - CGF.EmitBlock(MergeBB); + // if (lane_id == 0) + llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master"); + Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB); + CGF.EmitBlock(ThenBB); - Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg); - llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar( - AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, SourceLocation()); + // Reduce element = LocalReduceList[i] + Address ElemPtrPtrAddr = + Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize()); + llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar( + ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation()); + // elemptr = ((CopyType*)(elemptrptr)) + I + Address ElemPtr = Address(ElemPtrPtr, Align); + ElemPtr = Bld.CreateElementBitCast(ElemPtr, CopyType); + if (NumIters > 1) { + ElemPtr = Address(Bld.CreateGEP(ElemPtr.getPointer(), Cnt), + ElemPtr.getAlignment()); + } - llvm::Value *NumActiveThreads = Bld.CreateNSWMul( - NumWarpsVal, getNVPTXWarpSize(CGF), "num_active_threads"); - // named_barrier_sync(ParallelBarrierID, num_active_threads) - syncParallelThreads(CGF, NumActiveThreads); + // Get pointer to location in transfer medium. + // MediumPtr = &medium[warp_id] + llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP( + TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID}); + Address MediumPtr(MediumPtrVal, Align); + // Casting to actual data type. + // MediumPtr = (CopyType*)MediumPtrAddr; + MediumPtr = Bld.CreateElementBitCast(MediumPtr, CopyType); + + // elem = *elemptr + //*MediumPtr = elem + llvm::Value *Elem = + CGF.EmitLoadOfScalar(ElemPtr, /*Volatile=*/false, CType, Loc); + // Store the source element value to the dest element address. + CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType); + + Bld.CreateBr(MergeBB); + + CGF.EmitBlock(ElseBB); + Bld.CreateBr(MergeBB); + + CGF.EmitBlock(MergeBB); + + Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg); + llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar( + AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, Loc); + + llvm::Value *NumActiveThreads = Bld.CreateNSWMul( + NumWarpsVal, getNVPTXWarpSize(CGF), "num_active_threads"); + // named_barrier_sync(ParallelBarrierID, num_active_threads) + syncParallelThreads(CGF, NumActiveThreads); + + // + // Warp 0 copies reduce element from transfer medium. + // + llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then"); + llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else"); + llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont"); + + // Up to 32 threads in warp 0 are active. + llvm::Value *IsActiveThread = + Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread"); + Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB); + + CGF.EmitBlock(W0ThenBB); + + // SrcMediumPtr = &medium[tid] + llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP( + TransferMedium, + {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID}); + Address SrcMediumPtr(SrcMediumPtrVal, Align); + // SrcMediumVal = *SrcMediumPtr; + SrcMediumPtr = Bld.CreateElementBitCast(SrcMediumPtr, CopyType); + + // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I + Address TargetElemPtrPtr = + Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize()); + llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar( + TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, Loc); + Address TargetElemPtr = Address(TargetElemPtrVal, Align); + TargetElemPtr = Bld.CreateElementBitCast(TargetElemPtr, CopyType); + if (NumIters > 1) { + TargetElemPtr = Address(Bld.CreateGEP(TargetElemPtr.getPointer(), Cnt), + TargetElemPtr.getAlignment()); + } - // - // Warp 0 copies reduce element from transfer medium. - // - llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then"); - llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else"); - llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont"); - - // Up to 32 threads in warp 0 are active. - llvm::Value *IsActiveThread = - Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread"); - Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB); - - CGF.EmitBlock(W0ThenBB); - - // SrcMediumPtr = &medium[tid] - llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP( - TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID}); - Address SrcMediumPtr(SrcMediumPtrVal, - C.getTypeAlignInChars(Private->getType())); - // SrcMediumVal = *SrcMediumPtr; - SrcMediumPtr = Bld.CreateElementBitCast( - SrcMediumPtr, CGF.ConvertTypeForMem(Private->getType())); - - // TargetElemPtr = (type[i]*)(SrcDataAddr[i]) - Address TargetElemPtrPtr = - Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize()); - llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar( - TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation()); - Address TargetElemPtr = - Address(TargetElemPtrVal, C.getTypeAlignInChars(Private->getType())); - TargetElemPtr = Bld.CreateElementBitCast( - TargetElemPtr, CGF.ConvertTypeForMem(Private->getType())); - - // *TargetElemPtr = SrcMediumVal; - if (Private->getType()->isScalarType()) { - llvm::Value *SrcMediumValue = CGF.EmitLoadOfScalar( - SrcMediumPtr, /*Volatile=*/false, Private->getType(), Loc); + // *TargetElemPtr = SrcMediumVal; + llvm::Value *SrcMediumValue = + CGF.EmitLoadOfScalar(SrcMediumPtr, /*Volatile=*/true, CType, Loc); CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false, - Private->getType()); - } else { - CGF.EmitAggregateCopy( - CGF.MakeAddrLValue(SrcMediumPtr, Private->getType()), - CGF.MakeAddrLValue(TargetElemPtr, Private->getType()), - Private->getType(), AggValueSlot::DoesNotOverlap); - } - Bld.CreateBr(W0MergeBB); + CType); + Bld.CreateBr(W0MergeBB); - CGF.EmitBlock(W0ElseBB); - Bld.CreateBr(W0MergeBB); + CGF.EmitBlock(W0ElseBB); + Bld.CreateBr(W0MergeBB); - CGF.EmitBlock(W0MergeBB); + CGF.EmitBlock(W0MergeBB); - // While warp 0 copies values from transfer medium, all other warps must - // wait. - syncParallelThreads(CGF, NumActiveThreads); + // While warp 0 copies values from transfer medium, all other warps must + // wait. + syncParallelThreads(CGF, NumActiveThreads); + if (NumIters > 1) { + Cnt = Bld.CreateNSWAdd(Cnt, llvm::ConstantInt::get(CGM.IntTy, /*V=*/1)); + CGF.EmitStoreOfScalar(Cnt, CntAddr, /*Volatile=*/false, C.IntTy); + CGF.EmitBranch(PrecondBB); + (void)ApplyDebugLocation::CreateEmpty(CGF); + CGF.EmitBlock(ExitBB); + } + RealTySize %= TySize; + } ++Idx; } @@ -3926,16 +4001,17 @@ void CGOpenMPRuntimeNVPTX::emitReduction( bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind); bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind); - bool SimdReduction = isOpenMPSimdDirective(Options.ReductionKind); - assert((TeamsReduction || ParallelReduction || SimdReduction) && - "Invalid reduction selection in emitReduction."); if (Options.SimpleReduction) { + assert(!TeamsReduction && !ParallelReduction && + "Invalid reduction selection in emitReduction."); CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs, ReductionOps, Options); return; } + assert((TeamsReduction || ParallelReduction) && + "Invalid reduction selection in emitReduction."); ASTContext &C = CGM.getContext(); // 1. Build a list of reduction variables. @@ -3993,24 +4069,20 @@ void CGOpenMPRuntimeNVPTX::emitReduction( llvm::Value *InterWarpCopyFn = emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc); - llvm::Value *Args[] = {ThreadId, - CGF.Builder.getInt32(RHSExprs.size()), - ReductionArrayTySize, - RL, - ShuffleAndReduceFn, - InterWarpCopyFn}; + llvm::Value *Res; + if (ParallelReduction) { + llvm::Value *Args[] = {ThreadId, + CGF.Builder.getInt32(RHSExprs.size()), + ReductionArrayTySize, + RL, + ShuffleAndReduceFn, + InterWarpCopyFn}; - llvm::Value *Res = nullptr; - if (ParallelReduction) Res = CGF.EmitRuntimeCall( createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_reduce_nowait), Args); - else if (SimdReduction) - Res = CGF.EmitRuntimeCall( - createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_simd_reduce_nowait), - Args); - - if (TeamsReduction) { + } else { + assert(TeamsReduction && "expected teams reduction."); llvm::Value *ScratchPadCopyFn = emitCopyToScratchpad(CGM, Privates, ReductionArrayTy, Loc); llvm::Value *LoadAndReduceFn = emitReduceScratchpadFunction( @@ -4029,18 +4101,18 @@ void CGOpenMPRuntimeNVPTX::emitReduction( Args); } - // 5. Build switch(res) - llvm::BasicBlock *DefaultBB = CGF.createBasicBlock(".omp.reduction.default"); - llvm::SwitchInst *SwInst = - CGF.Builder.CreateSwitch(Res, DefaultBB, /*NumCases=*/1); + // 5. Build if (res == 1) + llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".omp.reduction.done"); + llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".omp.reduction.then"); + llvm::Value *Cond = CGF.Builder.CreateICmpEQ( + Res, llvm::ConstantInt::get(CGM.Int32Ty, /*V=*/1)); + CGF.Builder.CreateCondBr(Cond, ThenBB, ExitBB); - // 6. Build case 1: where we have reduced values in the master + // 6. Build then branch: where we have reduced values in the master // thread in each team. // __kmpc_end_reduce{_nowait}(<gtid>); // break; - llvm::BasicBlock *Case1BB = CGF.createBasicBlock(".omp.reduction.case1"); - SwInst->addCase(CGF.Builder.getInt32(1), Case1BB); - CGF.EmitBlock(Case1BB); + CGF.EmitBlock(ThenBB); // Add emission of __kmpc_end_reduce{_nowait}(<gtid>); llvm::Value *EndArgs[] = {ThreadId}; @@ -4064,8 +4136,9 @@ void CGOpenMPRuntimeNVPTX::emitReduction( EndArgs); RCG.setAction(Action); RCG(CGF); - CGF.EmitBranch(DefaultBB); - CGF.EmitBlock(DefaultBB, /*IsFinished=*/true); + // There is no need to emit line number for unconditional branch. + (void)ApplyDebugLocation::CreateEmpty(CGF); + CGF.EmitBlock(ExitBB, /*IsFinished=*/true); } const VarDecl * @@ -4292,6 +4365,8 @@ void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF, assert(D && "Expected function or captured|block decl."); assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 && "Function is registered already."); + assert((!TeamAndReductions.first || TeamAndReductions.first == D) && + "Team is set but not processed."); const Stmt *Body = nullptr; bool NeedToDelayGlobalization = false; if (const auto *FD = dyn_cast<FunctionDecl>(D)) { @@ -4307,10 +4382,12 @@ void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF, } if (!Body) return; - CheckVarsEscapingDeclContext VarChecker(CGF); + CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second); VarChecker.Visit(Body); const RecordDecl *GlobalizedVarsRecord = VarChecker.getGlobalizedRecord(IsInTTDRegion); + TeamAndReductions.first = nullptr; + TeamAndReductions.second.clear(); ArrayRef<const ValueDecl *> EscapedVariableLengthDecls = VarChecker.getEscapedVariableLengthDecls(); if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty()) @@ -4331,7 +4408,7 @@ void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF, Data.insert(std::make_pair(VD, MappedVarData(FD, IsInTTDRegion))); } if (!IsInTTDRegion && !NeedToDelayGlobalization && !IsInParallelRegion) { - CheckVarsEscapingDeclContext VarChecker(CGF); + CheckVarsEscapingDeclContext VarChecker(CGF, llvm::None); VarChecker.Visit(Body); I->getSecond().SecondaryGlobalRecord = VarChecker.getGlobalizedRecord(/*IsInTTDRegion=*/true); @@ -4583,7 +4660,7 @@ void CGOpenMPRuntimeNVPTX::clear() { llvm::Type *LLVMStaticTy = CGM.getTypes().ConvertTypeForMem(StaticTy); auto *GV = new llvm::GlobalVariable( CGM.getModule(), LLVMStaticTy, - /*isConstant=*/false, llvm::GlobalValue::WeakAnyLinkage, + /*isConstant=*/false, llvm::GlobalValue::CommonLinkage, llvm::Constant::getNullValue(LLVMStaticTy), "_openmp_shared_static_glob_rd_$_", /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal, @@ -4609,7 +4686,7 @@ void CGOpenMPRuntimeNVPTX::clear() { llvm::Type *LLVMArr2Ty = CGM.getTypes().ConvertTypeForMem(Arr2Ty); auto *GV = new llvm::GlobalVariable( CGM.getModule(), LLVMArr2Ty, - /*isConstant=*/false, llvm::GlobalValue::WeakAnyLinkage, + /*isConstant=*/false, llvm::GlobalValue::CommonLinkage, llvm::Constant::getNullValue(LLVMArr2Ty), "_openmp_static_glob_rd_$_"); auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast( |