11 files changed, 660 insertions(+), 284 deletions(-)
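In brief, the patch globalizes teams reduction variables unconditionally (tracking them through the new TeamAndReductions pair for non-SPMD regions), marks the nvvm barrier intrinsics convergent, narrows the inter-warp transfer medium from [32 x i64] to [32 x i32] with chunked volatile copies, swaps the switch over the reduction runtime's result for a plain compare-and-branch, and drops the separate simd reduction path. For reference, the shape of construct this targets, condensed from the updated nvptx_teams_reduction_codegen.cpp test (the function name here is invented; the test uses a template, ftemplate, and the newly added nested parallel is what lets the third target region run in SPMD mode):

    // Condensed from the test: teams-level reductions over a and b,
    // now combined with a nested parallel reduction on the same variables.
    int reduce_teams() {
      int a = 0;
      short b = 0;
    #pragma omp target
    #pragma omp teams reduction(|: a) reduction(max: b)
    #pragma omp parallel reduction(|: a) reduction(max: b)
      {
        a |= 1;
        b = 99 > b ? 99 : b;
      }
      return a + b;
    }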
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp index 8a984548927..da1c0d2a3a9 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp @@ -188,6 +188,28 @@ enum NamedBarrier : unsigned { NB_Parallel = 1, }; +static const ValueDecl *getPrivateItem(const Expr *RefExpr) { + RefExpr = RefExpr->IgnoreParens(); + if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) { + const Expr *Base = ASE->getBase()->IgnoreParenImpCasts(); + while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base)) + Base = TempASE->getBase()->IgnoreParenImpCasts(); + RefExpr = Base; + } else if (auto *OASE = dyn_cast<OMPArraySectionExpr>(RefExpr)) { + const Expr *Base = OASE->getBase()->IgnoreParenImpCasts(); + while (const auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Base)) + Base = TempOASE->getBase()->IgnoreParenImpCasts(); + while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base)) + Base = TempASE->getBase()->IgnoreParenImpCasts(); + RefExpr = Base; + } + RefExpr = RefExpr->IgnoreParenImpCasts(); + if (const auto *DE = dyn_cast<DeclRefExpr>(RefExpr)) + return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl()); + const auto *ME = cast<MemberExpr>(RefExpr); + return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl()); +} + typedef std::pair<CharUnits /*Align*/, const ValueDecl *> VarsDataTy; static bool stable_sort_comparator(const VarsDataTy P1, const VarsDataTy P2) { return P1.first > P2.first; @@ -394,7 +416,10 @@ class CheckVarsEscapingDeclContext final } public: - CheckVarsEscapingDeclContext(CodeGenFunction &CGF) : CGF(CGF) {} + CheckVarsEscapingDeclContext(CodeGenFunction &CGF, + ArrayRef<const ValueDecl *> TeamsReductions) + : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) { + } virtual ~CheckVarsEscapingDeclContext() = default; void VisitDeclStmt(const DeclStmt *S) { if (!S) @@ -614,8 +639,10 @@ static llvm::Value *getNVPTXNumThreads(CodeGenFunction &CGF) { /// Get barrier to synchronize all threads in a block. static void getNVPTXCTABarrier(CodeGenFunction &CGF) { - CGF.EmitRuntimeCall(llvm::Intrinsic::getDeclaration( - &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier0)); + llvm::Function *F = llvm::Intrinsic::getDeclaration( + &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier0); + F->addFnAttr(llvm::Attribute::Convergent); + CGF.EmitRuntimeCall(F); } /// Get barrier #ID to synchronize selected (multiple of warp size) threads in @@ -624,9 +651,10 @@ static void getNVPTXBarrier(CodeGenFunction &CGF, int ID, llvm::Value *NumThreads) { CGBuilderTy &Bld = CGF.Builder; llvm::Value *Args[] = {Bld.getInt32(ID), NumThreads}; - CGF.EmitRuntimeCall(llvm::Intrinsic::getDeclaration( - &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier), - Args); + llvm::Function *F = llvm::Intrinsic::getDeclaration( + &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier); + F->addFnAttr(llvm::Attribute::Convergent); + CGF.EmitRuntimeCall(F, Args); } /// Synchronize all GPU threads in a block. 
@@ -1965,10 +1993,20 @@ getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D, if (!Dir) return; for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) { - for (const Expr *E : C->getVarRefs()) { - const auto *DE = cast<DeclRefExpr>(E->IgnoreParens()); - Vars.push_back(cast<ValueDecl>(DE->getDecl()->getCanonicalDecl())); - } + for (const Expr *E : C->getVarRefs()) + Vars.push_back(getPrivateItem(E)); + } +} + +/// Get list of reduction variables from the teams ... directives. +static void +getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D, + llvm::SmallVectorImpl<const ValueDecl *> &Vars) { + assert(isOpenMPTeamsDirective(D.getDirectiveKind()) && + "expected teams directive."); + for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) { + for (const Expr *E : C->privates()) + Vars.push_back(getPrivateItem(E)); } } @@ -1978,13 +2016,22 @@ llvm::Value *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction( SourceLocation Loc = D.getBeginLoc(); const RecordDecl *GlobalizedRD = nullptr; - llvm::SmallVector<const ValueDecl *, 4> LastPrivates; + llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions; llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields; + // Globalize team reductions variable unconditionally in all modes. + getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions); if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD) { - getDistributeLastprivateVars(CGM.getContext(), D, LastPrivates); - if (!LastPrivates.empty()) + getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions); + if (!LastPrivatesReductions.empty()) { GlobalizedRD = ::buildRecordForGlobalizedVars( - CGM.getContext(), llvm::None, LastPrivates, MappedDeclsFields); + CGM.getContext(), llvm::None, LastPrivatesReductions, + MappedDeclsFields); + } + } else if (!LastPrivatesReductions.empty()) { + assert(!TeamAndReductions.first && + "Previous team declaration is not expected."); + TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl(); + std::swap(TeamAndReductions.second, LastPrivatesReductions); } // Emit target region as a standalone region. 
@@ -2162,7 +2209,7 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF, /*Volatile=*/false, Int16Ty, Loc); auto *StaticGlobalized = new llvm::GlobalVariable( CGM.getModule(), CGM.Int8Ty, /*isConstant=*/false, - llvm::GlobalValue::WeakAnyLinkage, nullptr); + llvm::GlobalValue::CommonLinkage, nullptr); auto *RecSize = new llvm::GlobalVariable( CGM.getModule(), CGM.SizeTy, /*isConstant=*/true, llvm::GlobalValue::InternalLinkage, nullptr, @@ -2801,11 +2848,12 @@ static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr, CGF, CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc), IntType, Offset, Loc); CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType); - Ptr = Bld.CreateConstGEP(Ptr, 1, CharUnits::fromQuantity(IntSize)); - ElemPtr = + Address LocalPtr = + Bld.CreateConstGEP(Ptr, 1, CharUnits::fromQuantity(IntSize)); + Address LocalElemPtr = Bld.CreateConstGEP(ElemPtr, 1, CharUnits::fromQuantity(IntSize)); - PhiSrc->addIncoming(Ptr.getPointer(), ThenBB); - PhiDest->addIncoming(ElemPtr.getPointer(), ThenBB); + PhiSrc->addIncoming(LocalPtr.getPointer(), ThenBB); + PhiDest->addIncoming(LocalElemPtr.getPointer(), ThenBB); CGF.EmitBranch(PreCondBB); CGF.EmitBlock(ExitBB); } else { @@ -3228,10 +3276,9 @@ static llvm::Value *emitCopyToScratchpad(CodeGenModule &CGM, CGF.SizeTy, /*isSigned=*/true); Address AddrWidthArg = CGF.GetAddrOfLocalVar(&WidthArg); - llvm::Value *WidthVal = - Bld.CreateIntCast(CGF.EmitLoadOfScalar(AddrWidthArg, /*Volatile=*/false, - Int32Ty, SourceLocation()), - CGF.SizeTy, /*isSigned=*/true); + llvm::Value *WidthVal = Bld.CreateIntCast( + CGF.EmitLoadOfScalar(AddrWidthArg, /*Volatile=*/false, Int32Ty, Loc), + CGF.SizeTy, /*isSigned=*/true); // The absolute ptr address to the base addr of the next element to copy. llvm::Value *CumulativeElemBasePtr = @@ -3305,11 +3352,10 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM, llvm::GlobalVariable *TransferMedium = M.getGlobalVariable(TransferMediumName); if (!TransferMedium) { - auto *Ty = llvm::ArrayType::get(CGM.Int64Ty, WarpSize); + auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize); unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared); TransferMedium = new llvm::GlobalVariable( - M, Ty, - /*isConstant=*/false, llvm::GlobalVariable::CommonLinkage, + M, Ty, /*isConstant=*/false, llvm::GlobalVariable::CommonLinkage, llvm::Constant::getNullValue(Ty), TransferMediumName, /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal, SharedAddressSpace); @@ -3327,7 +3373,7 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM, Address LocalReduceList( Bld.CreatePointerBitCastOrAddrSpaceCast( CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false, - C.VoidPtrTy, SourceLocation()), + C.VoidPtrTy, Loc), CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()), CGF.getPointerAlign()); @@ -3337,121 +3383,150 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM, // Warp master copies reduce element to transfer medium in __shared__ // memory. 
// - llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then"); - llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else"); - llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont"); - - // if (lane_id == 0) - llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master"); - Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB); - CGF.EmitBlock(ThenBB); - - // Reduce element = LocalReduceList[i] - Address ElemPtrPtrAddr = - Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize()); - llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar( - ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation()); - // elemptr = (type[i]*)(elemptrptr) - Address ElemPtr = - Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType())); - ElemPtr = Bld.CreateElementBitCast( - ElemPtr, CGF.ConvertTypeForMem(Private->getType())); - - // Get pointer to location in transfer medium. - // MediumPtr = &medium[warp_id] - llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP( - TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID}); - Address MediumPtr(MediumPtrVal, C.getTypeAlignInChars(Private->getType())); - // Casting to actual data type. - // MediumPtr = (type[i]*)MediumPtrAddr; - MediumPtr = Bld.CreateElementBitCast( - MediumPtr, CGF.ConvertTypeForMem(Private->getType())); - - // elem = *elemptr - //*MediumPtr = elem - if (Private->getType()->isScalarType()) { - llvm::Value *Elem = CGF.EmitLoadOfScalar(ElemPtr, /*Volatile=*/false, - Private->getType(), Loc); - // Store the source element value to the dest element address. - CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/false, - Private->getType()); - } else { - CGF.EmitAggregateCopy(CGF.MakeAddrLValue(ElemPtr, Private->getType()), - CGF.MakeAddrLValue(MediumPtr, Private->getType()), - Private->getType(), AggValueSlot::DoesNotOverlap); - } - - Bld.CreateBr(MergeBB); - - CGF.EmitBlock(ElseBB); - Bld.CreateBr(MergeBB); + unsigned RealTySize = + C.getTypeSizeInChars(Private->getType()) + .alignTo(C.getTypeAlignInChars(Private->getType())) + .getQuantity(); + for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /=2) { + unsigned NumIters = RealTySize / TySize; + if (NumIters == 0) + continue; + QualType CType = C.getIntTypeForBitwidth( + C.toBits(CharUnits::fromQuantity(TySize)), /*Signed=*/1); + llvm::Type *CopyType = CGF.ConvertTypeForMem(CType); + CharUnits Align = CharUnits::fromQuantity(TySize); + llvm::Value *Cnt = nullptr; + Address CntAddr = Address::invalid(); + llvm::BasicBlock *PrecondBB = nullptr; + llvm::BasicBlock *ExitBB = nullptr; + if (NumIters > 1) { + CntAddr = CGF.CreateMemTemp(C.IntTy, ".cnt.addr"); + CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.IntTy), CntAddr, + /*Volatile=*/false, C.IntTy); + PrecondBB = CGF.createBasicBlock("precond"); + ExitBB = CGF.createBasicBlock("exit"); + llvm::BasicBlock *BodyBB = CGF.createBasicBlock("body"); + // There is no need to emit line number for unconditional branch. 
+ (void)ApplyDebugLocation::CreateEmpty(CGF); + CGF.EmitBlock(PrecondBB); + Cnt = CGF.EmitLoadOfScalar(CntAddr, /*Volatile=*/false, C.IntTy, Loc); + llvm::Value *Cmp = + Bld.CreateICmpULT(Cnt, llvm::ConstantInt::get(CGM.IntTy, NumIters)); + Bld.CreateCondBr(Cmp, BodyBB, ExitBB); + CGF.EmitBlock(BodyBB); + } + llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then"); + llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else"); + llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont"); - CGF.EmitBlock(MergeBB); + // if (lane_id == 0) + llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master"); + Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB); + CGF.EmitBlock(ThenBB); - Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg); - llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar( - AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, SourceLocation()); + // Reduce element = LocalReduceList[i] + Address ElemPtrPtrAddr = + Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize()); + llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar( + ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation()); + // elemptr = ((CopyType*)(elemptrptr)) + I + Address ElemPtr = Address(ElemPtrPtr, Align); + ElemPtr = Bld.CreateElementBitCast(ElemPtr, CopyType); + if (NumIters > 1) { + ElemPtr = Address(Bld.CreateGEP(ElemPtr.getPointer(), Cnt), + ElemPtr.getAlignment()); + } - llvm::Value *NumActiveThreads = Bld.CreateNSWMul( - NumWarpsVal, getNVPTXWarpSize(CGF), "num_active_threads"); - // named_barrier_sync(ParallelBarrierID, num_active_threads) - syncParallelThreads(CGF, NumActiveThreads); + // Get pointer to location in transfer medium. + // MediumPtr = &medium[warp_id] + llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP( + TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID}); + Address MediumPtr(MediumPtrVal, Align); + // Casting to actual data type. + // MediumPtr = (CopyType*)MediumPtrAddr; + MediumPtr = Bld.CreateElementBitCast(MediumPtr, CopyType); + + // elem = *elemptr + //*MediumPtr = elem + llvm::Value *Elem = + CGF.EmitLoadOfScalar(ElemPtr, /*Volatile=*/false, CType, Loc); + // Store the source element value to the dest element address. + CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType); + + Bld.CreateBr(MergeBB); + + CGF.EmitBlock(ElseBB); + Bld.CreateBr(MergeBB); + + CGF.EmitBlock(MergeBB); + + Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg); + llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar( + AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, Loc); + + llvm::Value *NumActiveThreads = Bld.CreateNSWMul( + NumWarpsVal, getNVPTXWarpSize(CGF), "num_active_threads"); + // named_barrier_sync(ParallelBarrierID, num_active_threads) + syncParallelThreads(CGF, NumActiveThreads); + + // + // Warp 0 copies reduce element from transfer medium. + // + llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then"); + llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else"); + llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont"); + + // Up to 32 threads in warp 0 are active. 
+ llvm::Value *IsActiveThread = + Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread"); + Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB); + + CGF.EmitBlock(W0ThenBB); + + // SrcMediumPtr = &medium[tid] + llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP( + TransferMedium, + {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID}); + Address SrcMediumPtr(SrcMediumPtrVal, Align); + // SrcMediumVal = *SrcMediumPtr; + SrcMediumPtr = Bld.CreateElementBitCast(SrcMediumPtr, CopyType); + + // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I + Address TargetElemPtrPtr = + Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize()); + llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar( + TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, Loc); + Address TargetElemPtr = Address(TargetElemPtrVal, Align); + TargetElemPtr = Bld.CreateElementBitCast(TargetElemPtr, CopyType); + if (NumIters > 1) { + TargetElemPtr = Address(Bld.CreateGEP(TargetElemPtr.getPointer(), Cnt), + TargetElemPtr.getAlignment()); + } - // - // Warp 0 copies reduce element from transfer medium. - // - llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then"); - llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else"); - llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont"); - - // Up to 32 threads in warp 0 are active. - llvm::Value *IsActiveThread = - Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread"); - Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB); - - CGF.EmitBlock(W0ThenBB); - - // SrcMediumPtr = &medium[tid] - llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP( - TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID}); - Address SrcMediumPtr(SrcMediumPtrVal, - C.getTypeAlignInChars(Private->getType())); - // SrcMediumVal = *SrcMediumPtr; - SrcMediumPtr = Bld.CreateElementBitCast( - SrcMediumPtr, CGF.ConvertTypeForMem(Private->getType())); - - // TargetElemPtr = (type[i]*)(SrcDataAddr[i]) - Address TargetElemPtrPtr = - Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize()); - llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar( - TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation()); - Address TargetElemPtr = - Address(TargetElemPtrVal, C.getTypeAlignInChars(Private->getType())); - TargetElemPtr = Bld.CreateElementBitCast( - TargetElemPtr, CGF.ConvertTypeForMem(Private->getType())); - - // *TargetElemPtr = SrcMediumVal; - if (Private->getType()->isScalarType()) { - llvm::Value *SrcMediumValue = CGF.EmitLoadOfScalar( - SrcMediumPtr, /*Volatile=*/false, Private->getType(), Loc); + // *TargetElemPtr = SrcMediumVal; + llvm::Value *SrcMediumValue = + CGF.EmitLoadOfScalar(SrcMediumPtr, /*Volatile=*/true, CType, Loc); CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false, - Private->getType()); - } else { - CGF.EmitAggregateCopy( - CGF.MakeAddrLValue(SrcMediumPtr, Private->getType()), - CGF.MakeAddrLValue(TargetElemPtr, Private->getType()), - Private->getType(), AggValueSlot::DoesNotOverlap); - } - Bld.CreateBr(W0MergeBB); + CType); + Bld.CreateBr(W0MergeBB); - CGF.EmitBlock(W0ElseBB); - Bld.CreateBr(W0MergeBB); + CGF.EmitBlock(W0ElseBB); + Bld.CreateBr(W0MergeBB); - CGF.EmitBlock(W0MergeBB); + CGF.EmitBlock(W0MergeBB); - // While warp 0 copies values from transfer medium, all other warps must - // wait. - syncParallelThreads(CGF, NumActiveThreads); + // While warp 0 copies values from transfer medium, all other warps must + // wait. 
+ syncParallelThreads(CGF, NumActiveThreads); + if (NumIters > 1) { + Cnt = Bld.CreateNSWAdd(Cnt, llvm::ConstantInt::get(CGM.IntTy, /*V=*/1)); + CGF.EmitStoreOfScalar(Cnt, CntAddr, /*Volatile=*/false, C.IntTy); + CGF.EmitBranch(PrecondBB); + (void)ApplyDebugLocation::CreateEmpty(CGF); + CGF.EmitBlock(ExitBB); + } + RealTySize %= TySize; + } ++Idx; } @@ -3926,16 +4001,17 @@ void CGOpenMPRuntimeNVPTX::emitReduction( bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind); bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind); - bool SimdReduction = isOpenMPSimdDirective(Options.ReductionKind); - assert((TeamsReduction || ParallelReduction || SimdReduction) && - "Invalid reduction selection in emitReduction."); if (Options.SimpleReduction) { + assert(!TeamsReduction && !ParallelReduction && + "Invalid reduction selection in emitReduction."); CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs, ReductionOps, Options); return; } + assert((TeamsReduction || ParallelReduction) && + "Invalid reduction selection in emitReduction."); ASTContext &C = CGM.getContext(); // 1. Build a list of reduction variables. @@ -3993,24 +4069,20 @@ void CGOpenMPRuntimeNVPTX::emitReduction( llvm::Value *InterWarpCopyFn = emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc); - llvm::Value *Args[] = {ThreadId, - CGF.Builder.getInt32(RHSExprs.size()), - ReductionArrayTySize, - RL, - ShuffleAndReduceFn, - InterWarpCopyFn}; + llvm::Value *Res; + if (ParallelReduction) { + llvm::Value *Args[] = {ThreadId, + CGF.Builder.getInt32(RHSExprs.size()), + ReductionArrayTySize, + RL, + ShuffleAndReduceFn, + InterWarpCopyFn}; - llvm::Value *Res = nullptr; - if (ParallelReduction) Res = CGF.EmitRuntimeCall( createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_reduce_nowait), Args); - else if (SimdReduction) - Res = CGF.EmitRuntimeCall( - createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_simd_reduce_nowait), - Args); - - if (TeamsReduction) { + } else { + assert(TeamsReduction && "expected teams reduction."); llvm::Value *ScratchPadCopyFn = emitCopyToScratchpad(CGM, Privates, ReductionArrayTy, Loc); llvm::Value *LoadAndReduceFn = emitReduceScratchpadFunction( @@ -4029,18 +4101,18 @@ void CGOpenMPRuntimeNVPTX::emitReduction( Args); } - // 5. Build switch(res) - llvm::BasicBlock *DefaultBB = CGF.createBasicBlock(".omp.reduction.default"); - llvm::SwitchInst *SwInst = - CGF.Builder.CreateSwitch(Res, DefaultBB, /*NumCases=*/1); + // 5. Build if (res == 1) + llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".omp.reduction.done"); + llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".omp.reduction.then"); + llvm::Value *Cond = CGF.Builder.CreateICmpEQ( + Res, llvm::ConstantInt::get(CGM.Int32Ty, /*V=*/1)); + CGF.Builder.CreateCondBr(Cond, ThenBB, ExitBB); - // 6. Build case 1: where we have reduced values in the master + // 6. Build then branch: where we have reduced values in the master // thread in each team. // __kmpc_end_reduce{_nowait}(<gtid>); // break; - llvm::BasicBlock *Case1BB = CGF.createBasicBlock(".omp.reduction.case1"); - SwInst->addCase(CGF.Builder.getInt32(1), Case1BB); - CGF.EmitBlock(Case1BB); + CGF.EmitBlock(ThenBB); // Add emission of __kmpc_end_reduce{_nowait}(<gtid>); llvm::Value *EndArgs[] = {ThreadId}; @@ -4064,8 +4136,9 @@ void CGOpenMPRuntimeNVPTX::emitReduction( EndArgs); RCG.setAction(Action); RCG(CGF); - CGF.EmitBranch(DefaultBB); - CGF.EmitBlock(DefaultBB, /*IsFinished=*/true); + // There is no need to emit line number for unconditional branch. 
+ (void)ApplyDebugLocation::CreateEmpty(CGF); + CGF.EmitBlock(ExitBB, /*IsFinished=*/true); } const VarDecl * @@ -4292,6 +4365,8 @@ void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF, assert(D && "Expected function or captured|block decl."); assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 && "Function is registered already."); + assert((!TeamAndReductions.first || TeamAndReductions.first == D) && + "Team is set but not processed."); const Stmt *Body = nullptr; bool NeedToDelayGlobalization = false; if (const auto *FD = dyn_cast<FunctionDecl>(D)) { @@ -4307,10 +4382,12 @@ void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF, } if (!Body) return; - CheckVarsEscapingDeclContext VarChecker(CGF); + CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second); VarChecker.Visit(Body); const RecordDecl *GlobalizedVarsRecord = VarChecker.getGlobalizedRecord(IsInTTDRegion); + TeamAndReductions.first = nullptr; + TeamAndReductions.second.clear(); ArrayRef<const ValueDecl *> EscapedVariableLengthDecls = VarChecker.getEscapedVariableLengthDecls(); if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty()) @@ -4331,7 +4408,7 @@ void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF, Data.insert(std::make_pair(VD, MappedVarData(FD, IsInTTDRegion))); } if (!IsInTTDRegion && !NeedToDelayGlobalization && !IsInParallelRegion) { - CheckVarsEscapingDeclContext VarChecker(CGF); + CheckVarsEscapingDeclContext VarChecker(CGF, llvm::None); VarChecker.Visit(Body); I->getSecond().SecondaryGlobalRecord = VarChecker.getGlobalizedRecord(/*IsInTTDRegion=*/true); @@ -4583,7 +4660,7 @@ void CGOpenMPRuntimeNVPTX::clear() { llvm::Type *LLVMStaticTy = CGM.getTypes().ConvertTypeForMem(StaticTy); auto *GV = new llvm::GlobalVariable( CGM.getModule(), LLVMStaticTy, - /*isConstant=*/false, llvm::GlobalValue::WeakAnyLinkage, + /*isConstant=*/false, llvm::GlobalValue::CommonLinkage, llvm::Constant::getNullValue(LLVMStaticTy), "_openmp_shared_static_glob_rd_$_", /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal, @@ -4609,7 +4686,7 @@ void CGOpenMPRuntimeNVPTX::clear() { llvm::Type *LLVMArr2Ty = CGM.getTypes().ConvertTypeForMem(Arr2Ty); auto *GV = new llvm::GlobalVariable( CGM.getModule(), LLVMArr2Ty, - /*isConstant=*/false, llvm::GlobalValue::WeakAnyLinkage, + /*isConstant=*/false, llvm::GlobalValue::CommonLinkage, llvm::Constant::getNullValue(LLVMArr2Ty), "_openmp_static_glob_rd_$_"); auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast( diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h index a157e421e06..aff9cf21135 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h +++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h @@ -431,6 +431,10 @@ private: /// Shared pointer for the global memory in the global memory buffer used for /// the given kernel. llvm::GlobalVariable *KernelStaticGlobalized = nullptr; + /// Pair of the Non-SPMD team and all reductions variables in this team + /// region. + std::pair<const Decl *, llvm::SmallVector<const ValueDecl *, 4>> + TeamAndReductions; }; } // CodeGen namespace. 
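The biggest mechanical change above is in emitInterWarpCopyFunction: instead of staging each reduction element whole through a [32 x i64] shared array (with separate scalar and aggregate paths), elements now go through a [32 x i32] array in 4-, 2-, then 1-byte chunks, with volatile loads and stores on the medium and a counter-driven loop (the CntAddr/precond/body/exit blocks in the IR) whenever more than one chunk of a given width is needed. A minimal host-side sketch of just that chunk-planning loop, mirroring the loop structure in the diff (planTransfer and the printf tracing are illustrative, not clang code):

    #include <cstdio>

    // Models how emitInterWarpCopyFunction splits an element of RealTySize
    // bytes into copies through the 32 x i32 transfer medium: widest chunks
    // first, the remainder handed down to the next smaller width.
    static void planTransfer(unsigned RealTySize) {
      for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
        unsigned NumIters = RealTySize / TySize; // chunks at this width
        if (NumIters == 0)
          continue; // element narrower than this width; try the next one
        // In the generated code each iteration is: warp master does a
        // volatile store of one chunk into medium[warp_id], named-barrier
        // sync, warp 0 does a volatile load back into the reduce list, sync.
        std::printf("copy %u chunk(s) of %u byte(s)\n", NumIters, TySize);
        RealTySize %= TySize; // leftover bytes for the smaller widths
      }
    }

    int main() {
      planTransfer(8); // a double: two 4-byte chunks, driven by the Cnt loop
      planTransfer(7); // one 4-byte, one 2-byte, one 1-byte chunk
      planTransfer(2); // a short: skips width 4, copies once at width 2
    }

Note that the continue taken when NumIters == 0 leaves RealTySize untouched, so an element narrower than the current width simply falls through to the next smaller one.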
diff --git a/clang/test/OpenMP/nvptx_data_sharing.cpp b/clang/test/OpenMP/nvptx_data_sharing.cpp index 7f822aba921..ed3c88b5771 100644 --- a/clang/test/OpenMP/nvptx_data_sharing.cpp +++ b/clang/test/OpenMP/nvptx_data_sharing.cpp @@ -27,7 +27,7 @@ void test_ds(){ } } // CK1: [[MEM_TY:%.+]] = type { [8 x i8] } -// CK1-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] zeroinitializer +// CK1-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer // CK1-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null // CK1-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i64 8 // CK1-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1 diff --git a/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp b/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp index 71a3ad54917..a84962c4d81 100644 --- a/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp +++ b/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp @@ -22,7 +22,7 @@ int main(int argc, char **argv) { } // CHECK: [[MEM_TY:%.+]] = type { [84 x i8] } -// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] zeroinitializer +// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer // CHECK-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null // CHECK-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 84 // CHECK-DAG: @__omp_offloading_{{.*}}_main_l17_exec_mode = weak constant i8 1 diff --git a/clang/test/OpenMP/nvptx_parallel_codegen.cpp b/clang/test/OpenMP/nvptx_parallel_codegen.cpp index a3b1b012bf0..2fd837c92b2 100644 --- a/clang/test/OpenMP/nvptx_parallel_codegen.cpp +++ b/clang/test/OpenMP/nvptx_parallel_codegen.cpp @@ -72,7 +72,7 @@ int bar(int n){ } // CHECK: [[MEM_TY:%.+]] = type { [4 x i8] } -// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] zeroinitializer +// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer // CHECK-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null // CHECK-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4 // CHECK-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1 diff --git a/clang/test/OpenMP/nvptx_parallel_for_codegen.cpp b/clang/test/OpenMP/nvptx_parallel_for_codegen.cpp index c292152e658..25a7a15693a 100644 --- a/clang/test/OpenMP/nvptx_parallel_for_codegen.cpp +++ b/clang/test/OpenMP/nvptx_parallel_for_codegen.cpp @@ -31,7 +31,7 @@ int bar(int n){ } // CHECK: [[MEM_TY:%.+]] = type { [4 x i8] } -// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] zeroinitializer +// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer // CHECK-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null // CHECK-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4 // CHECK-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1 diff --git a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp index 0073db6f288..1687c8ea761 100644 --- a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp @@ -9,7 +9,7 @@ #define HEADER // Check for the data transfer medium in shared memory to transfer the reduction list to the first warp. 
-// CHECK-DAG: [[TRANSFER_STORAGE:@.+]] = common addrspace([[SHARED_ADDRSPACE:[0-9]+]]) global [32 x i64] +// CHECK-DAG: [[TRANSFER_STORAGE:@.+]] = common addrspace([[SHARED_ADDRSPACE:[0-9]+]]) global [32 x i32] // Check that the execution mode of all 3 target regions is set to Spmd Mode. // CHECK-DAG: {{@__omp_offloading_.+l27}}_exec_mode = weak constant i8 0 @@ -73,18 +73,16 @@ int bar(int n){ // CHECK: store i8* [[E_CAST]], i8** [[PTR1]], align // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8* // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait(i32 {{.+}}, i32 1, i{{32|64}} {{4|8}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]]) - // CHECK: switch i32 [[RET]], label {{%?}}[[DEFAULTLABEL:.+]] [ - // CHECK: i32 1, label {{%?}}[[REDLABEL:.+]] + // CHECK: [[CMP:%.+]] = icmp eq i32 [[RET]], 1 + // CHECK: br i1 [[CMP]], label - // CHECK: [[REDLABEL]] // CHECK: [[E_INV:%.+]] = load double, double* [[E_IN:%.+]], align // CHECK: [[EV:%.+]] = load double, double* [[E]], align // CHECK: [[ADD:%.+]] = fadd double [[E_INV]], [[EV]] // CHECK: store double [[ADD]], double* [[E_IN]], align // CHECK: call void @__kmpc_nvptx_end_reduce_nowait( - // CHECK: br label %[[DEFAULTLABEL]] + // CHECK: br label // - // CHECK: [[DEFAULTLABEL]] // CHECK: ret // @@ -187,18 +185,23 @@ int bar(int n){ // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31 // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5 // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]* + // CHECK: store i32 0, i32* [[CNT_ADDR:%.+]], + // CHECK: br label + // CHECK: [[CNT:%.+]] = load i32, i32* [[CNT_ADDR]], + // CHECK: [[DONE_COPY:%.+]] = icmp ult i32 [[CNT]], 2 + // CHECK: br i1 [[DONE_COPY]], label // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0 // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] // // [[DO_COPY]] // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], - // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double* + // CHECK: [[BASE_ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* + // CHECK: [[ELT:%.+]] = getelementptr i32, i32* [[BASE_ELT]], i32 [[CNT]] // - // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] - // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to double addrspace([[SHARED_ADDRSPACE]])* - // CHECK: [[ELT_VAL:%.+]] = load double, double* [[ELT]], align - // CHECK: store double [[ELT_VAL]], double addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] + // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], + // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], // CHECK: br label {{%?}}[[COPY_CONT:.+]] // // CHECK: [[COPY_ELSE]] @@ -215,13 +218,13 @@ int bar(int n){ // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]] // // CHECK: [[DO_READ]] - // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] - // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* 
[[MEDIUM_ELT64]] to double addrspace([[SHARED_ADDRSPACE]])* + // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], - // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double* - // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load double, double addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align - // CHECK: store double [[MEDIUM_ELT_VAL]], double* [[ELT]], align + // CHECK: [[ELT_BASE:%.+]] = bitcast i8* [[ELT_VOID]] to i32* + // CHECK: [[ELT:%.+]] = getelementptr i32, i32* [[ELT_BASE]], i32 [[CNT]] + // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], + // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], // CHECK: br label {{%?}}[[READ_CONT:.+]] // // CHECK: [[READ_ELSE]] @@ -229,6 +232,9 @@ int bar(int n){ // // CHECK: [[READ_CONT]] // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // CHECK: [[NEXT:%.+]] = add nsw i32 [[CNT]], 1 + // CHECK: store i32 [[NEXT]], i32* [[CNT_ADDR]], + // CHECK: br label // CHECK: ret @@ -268,10 +274,8 @@ int bar(int n){ // CHECK: store i8* [[D_CAST]], i8** [[PTR2]], align // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8* // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait(i32 {{.+}}, i32 2, i{{32|64}} {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]]) - // CHECK: switch i32 [[RET]], label {{%?}}[[DEFAULTLABEL:.+]] [ - // CHECK: i32 1, label {{%?}}[[REDLABEL:.+]] - - // CHECK: [[REDLABEL]] + // CHECK: [[CMP:%.+]] = icmp eq i32 [[RET]], 1 + // CHECK: br i1 [[CMP]], label // CHECK: [[C_INV8:%.+]] = load i8, i8* [[C_IN:%.+]], align // CHECK: [[C_INV:%.+]] = sext i8 [[C_INV8]] to i32 // CHECK: [[CV8:%.+]] = load i8, i8* [[C]], align @@ -284,9 +288,8 @@ int bar(int n){ // CHECK: [[MUL:%.+]] = fmul float [[D_INV]], [[DV]] // CHECK: store float [[MUL]], float* [[D_IN]], align // CHECK: call void @__kmpc_nvptx_end_reduce_nowait( - // CHECK: br label %[[DEFAULTLABEL]] + // CHECK: br label // - // CHECK: [[DEFAULTLABEL]] // CHECK: ret // @@ -432,10 +435,10 @@ int bar(int n){ // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], // - // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] - // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i8 addrspace([[SHARED_ADDRSPACE]])* + // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] + // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i8 addrspace([[SHARED_ADDRSPACE]])* // CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align - // CHECK: store i8 [[ELT_VAL]], i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: store volatile i8 [[ELT_VAL]], i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align // CHECK: br label {{%?}}[[COPY_CONT:.+]] // // CHECK: [[COPY_ELSE]] @@ -452,11 +455,11 @@ int bar(int n){ // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label 
{{%?}}[[READ_ELSE:.+]] // // CHECK: [[DO_READ]] - // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] - // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i8 addrspace([[SHARED_ADDRSPACE]])* + // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] + // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i8 addrspace([[SHARED_ADDRSPACE]])* // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], - // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load i8, i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i8, i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align // CHECK: store i8 [[MEDIUM_ELT_VAL]], i8* [[ELT_VOID]], align // CHECK: br label {{%?}}[[READ_CONT:.+]] // @@ -471,12 +474,11 @@ int bar(int n){ // [[DO_COPY]] // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], - // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float* + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* // - // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] - // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to float addrspace([[SHARED_ADDRSPACE]])* - // CHECK: [[ELT_VAL:%.+]] = load float, float* [[ELT]], align - // CHECK: store float [[ELT_VAL]], float addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] + // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align + // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align // CHECK: br label {{%?}}[[COPY_CONT:.+]] // // CHECK: [[COPY_ELSE]] @@ -493,13 +495,12 @@ int bar(int n){ // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]] // // CHECK: [[DO_READ]] - // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] - // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to float addrspace([[SHARED_ADDRSPACE]])* + // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 1 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], - // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float* - // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load float, float addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align - // CHECK: store float [[MEDIUM_ELT_VAL]], float* [[ELT]], align + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* + // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], 
align // CHECK: br label {{%?}}[[READ_CONT:.+]] // // CHECK: [[READ_ELSE]] @@ -560,10 +561,9 @@ int bar(int n){ // CHECK: store i8* [[B_CAST]], i8** [[PTR2]], align // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8* // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait(i32 {{.+}}, i32 2, i{{32|64}} {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]]) - // CHECK: switch i32 [[RET]], label {{%?}}[[DEFAULTLABEL:.+]] [ - // CHECK: i32 1, label {{%?}}[[REDLABEL:.+]] + // CHECK: [[CMP:%.+]] = icmp eq i32 [[RET]], 1 + // CHECK: br i1 [[CMP]], label - // CHECK: [[REDLABEL]] // CHECK: [[A_INV:%.+]] = load i32, i32* [[A_IN:%.+]], align // CHECK: [[AV:%.+]] = load i32, i32* [[A]], align // CHECK: [[OR:%.+]] = or i32 [[A_INV]], [[AV]] @@ -587,9 +587,8 @@ int bar(int n){ // CHECK: [[B_MAX:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ] // CHECK: store i16 [[B_MAX]], i16* [[B_IN]], align // CHECK: call void @__kmpc_nvptx_end_reduce_nowait( - // CHECK: br label %[[DEFAULTLABEL]] + // CHECK: br label // - // CHECK: [[DEFAULTLABEL]] // CHECK: ret // @@ -752,10 +751,9 @@ int bar(int n){ // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* // - // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] - // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i32 addrspace([[SHARED_ADDRSPACE]])* + // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align - // CHECK: store i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align // CHECK: br label {{%?}}[[COPY_CONT:.+]] // // CHECK: [[COPY_ELSE]] @@ -772,12 +770,11 @@ int bar(int n){ // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]] // // CHECK: [[DO_READ]] - // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] - // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i32 addrspace([[SHARED_ADDRSPACE]])* + // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* - // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align // CHECK: br label {{%?}}[[READ_CONT:.+]] // @@ -794,10 +791,10 @@ int bar(int n){ // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16* // - // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 
[[WARPID]] - // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i16 addrspace([[SHARED_ADDRSPACE]])* + // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] + // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])* // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align - // CHECK: store i16 [[ELT_VAL]], i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: store volatile i16 [[ELT_VAL]], i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align // CHECK: br label {{%?}}[[COPY_CONT:.+]] // // CHECK: [[COPY_ELSE]] @@ -814,12 +811,12 @@ int bar(int n){ // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]] // // CHECK: [[DO_READ]] - // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] - // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i16 addrspace([[SHARED_ADDRSPACE]])* + // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] + // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])* // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 1 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16* - // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load i16, i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i16, i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align // CHECK: store i16 [[MEDIUM_ELT_VAL]], i16* [[ELT]], align // CHECK: br label {{%?}}[[READ_CONT:.+]] // diff --git a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp index 8b8e0b0bba7..79315717ae6 100644 --- a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp @@ -68,7 +68,7 @@ int bar(int n){ } // CHECK-DAG: [[MEM_TY:%.+]] = type { [4 x i8] } -// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] zeroinitializer +// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer // CHECK-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null // CHECK-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4 // CHECK-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1 diff --git a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp index 395c10e6632..2e9ceb129b4 100644 --- a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp @@ -63,7 +63,7 @@ int bar(int n){ } // CHECK-DAG: [[MEM_TY:%.+]] = type { [4 x i8] } -// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] zeroinitializer +// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] 
zeroinitializer // CHECK-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null // CHECK-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4 // CHECK-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1 diff --git a/clang/test/OpenMP/nvptx_teams_codegen.cpp b/clang/test/OpenMP/nvptx_teams_codegen.cpp index 98988a5d4dd..4965a50781c 100644 --- a/clang/test/OpenMP/nvptx_teams_codegen.cpp +++ b/clang/test/OpenMP/nvptx_teams_codegen.cpp @@ -28,7 +28,7 @@ int main (int argc, char **argv) { } // CK1: [[MEM_TY:%.+]] = type { [{{4|8}} x i8] } -// CK1-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] zeroinitializer +// CK1-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer // CK1-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null // CK1-DAG: [[KERNEL_SIZE1:@.+]] = internal unnamed_addr constant i{{64|32}} 4 // CK1-DAG: [[KERNEL_SIZE2:@.+]] = internal unnamed_addr constant i{{64|32}} {{8|4}} @@ -115,7 +115,7 @@ int main (int argc, char **argv) { } // CK2: [[MEM_TY:%.+]] = type { [{{4|8}} x i8] } -// CK2-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] zeroinitializer +// CK2-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer // CK2-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null // CK2-DAG: [[KERNEL_SIZE1:@.+]] = internal unnamed_addr constant i{{64|32}} 4 // CK2-DAG: [[KERNEL_SIZE2:@.+]] = internal unnamed_addr constant i{{64|32}} {{8|4}} diff --git a/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp b/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp index b73b5a8e93e..e2a103ab86d 100644 --- a/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp +++ b/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp @@ -8,13 +8,23 @@ #ifndef HEADER #define HEADER +// CHECK: [[MAP_TY:%.+]] = type { [16 x i8] } + +// CHECK-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null +// CHECK-DAG: [[KERNEL_SHARED1:@.+]] = internal unnamed_addr constant i16 1 +// CHECK-DAG: [[KERNEL_SHARED2:@.+]] = internal unnamed_addr constant i16 1 +// CHECK-DAG: [[KERNEL_SHARED3:@.+]] = internal unnamed_addr constant i16 1 +// CHECK-DAG: [[KERNEL_SIZE1:@.+]] = internal unnamed_addr constant i{{64|32}} {{16|8}} +// CHECK-DAG: [[KERNEL_SIZE2:@.+]] = internal unnamed_addr constant i{{64|32}} 16 +// CHECK-DAG: [[KERNEL_SIZE3:@.+]] = internal unnamed_addr constant i{{64|32}} 8 + // Check for the data transfer medium in shared memory to transfer the reduction list to the first warp. -// CHECK-DAG: [[TRANSFER_STORAGE:@.+]] = common addrspace([[SHARED_ADDRSPACE:[0-9]+]]) global [32 x i64] +// CHECK-DAG: [[TRANSFER_STORAGE:@.+]] = common addrspace([[SHARED_ADDRSPACE:[0-9]+]]) global [32 x i32] -// Check that the execution mode of all 3 target regions is set to Generic Mode. -// CHECK-DAG: {{@__omp_offloading_.+l27}}_exec_mode = weak constant i8 1 -// CHECK-DAG: {{@__omp_offloading_.+l33}}_exec_mode = weak constant i8 1 -// CHECK-DAG: {{@__omp_offloading_.+l40}}_exec_mode = weak constant i8 1 +// Check that the execution mode of 2 target regions is set to Non-SPMD and the 3rd is in SPMD. 
+// CHECK-DAG: {{@__omp_offloading_.+l37}}_exec_mode = weak constant i8 1 +// CHECK-DAG: {{@__omp_offloading_.+l43}}_exec_mode = weak constant i8 1 +// CHECK-DAG: {{@__omp_offloading_.+l50}}_exec_mode = weak constant i8 0 template<typename tx> tx ftemplate(int n) { @@ -39,6 +49,7 @@ tx ftemplate(int n) { #pragma omp target #pragma omp teams reduction(|: a) reduction(max: b) + #pragma omp parallel reduction(|: a) reduction(max: b) { a |= 1; b = 99 > b ? 99 : b; @@ -55,9 +66,9 @@ int bar(int n){ return a; } - // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l27}}_worker() + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l37}}_worker() - // CHECK: define {{.*}}void [[T1:@__omp_offloading_.+template.+l27]]( + // CHECK: define {{.*}}void [[T1:@__omp_offloading_.+template.+l37]]( // // CHECK: {{call|invoke}} void [[T1]]_worker() // @@ -186,18 +197,23 @@ int bar(int n){ // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31 // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5 // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]* + // CHECK: store i32 0, i32* [[CNT_ADDR:%.+]], + // CHECK: br label + // CHECK: [[CNT:%.+]] = load i32, i32* [[CNT_ADDR]], + // CHECK: [[DONE_COPY:%.+]] = icmp ult i32 [[CNT]], 2 + // CHECK: br i1 [[DONE_COPY]], label // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0 // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] // // [[DO_COPY]] - // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], - // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double* + // CHECK: [[BASE_ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* + // CHECK: [[ELT:%.+]] = getelementptr i32, i32* [[BASE_ELT]], i32 [[CNT]] // - // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] - // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to double addrspace([[SHARED_ADDRSPACE]])* - // CHECK: [[ELT_VAL:%.+]] = load double, double* [[ELT]], align - // CHECK: store double [[ELT_VAL]], double addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] + // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], + // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], // CHECK: br label {{%?}}[[COPY_CONT:.+]] // // CHECK: [[COPY_ELSE]] @@ -214,13 +230,13 @@ int bar(int n){ // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]] // // CHECK: [[DO_READ]] - // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] - // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to double addrspace([[SHARED_ADDRSPACE]])* - // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds 
[[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
- // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load double, double addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
- // CHECK: store double [[MEDIUM_ELT_VAL]], double* [[ELT]], align
+ // CHECK: [[ELT_BASE:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+ // CHECK: [[ELT:%.+]] = getelementptr i32, i32* [[ELT_BASE]], i32 [[CNT]]
+ // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]],
+ // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]],
// CHECK: br label {{%?}}[[READ_CONT:.+]]
//
// CHECK: [[READ_ELSE]]
@@ -228,6 +244,9 @@ int bar(int n){
//
// CHECK: [[READ_CONT]]
// CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
+ // CHECK: [[NEXT:%.+]] = add nsw i32 [[CNT]], 1
+ // CHECK: store i32 [[NEXT]], i32* [[CNT_ADDR]],
+ // CHECK: br label
// CHECK: ret
//
@@ -307,9 +326,9 @@ int bar(int n){
// CHECK: [[REDUCE_CONT]]
// CHECK: ret
- // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l33}}_worker()
+ // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l43}}_worker()
- // CHECK: define {{.*}}void [[T2:@__omp_offloading_.+template.+l33]](
+ // CHECK: define {{.*}}void [[T2:@__omp_offloading_.+template.+l43]](
//
// CHECK: {{call|invoke}} void [[T2]]_worker()
//
@@ -495,10 +514,10 @@ int bar(int n){
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
//
- // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i8 addrspace([[SHARED_ADDRSPACE]])*
+ // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
+ // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i8 addrspace([[SHARED_ADDRSPACE]])*
// CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align
- // CHECK: store i8 [[ELT_VAL]], i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+ // CHECK: store volatile i8 [[ELT_VAL]], i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// CHECK: [[COPY_ELSE]]
@@ -515,11 +534,11 @@ int bar(int n){
// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
//
// CHECK: [[DO_READ]]
- // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i8 addrspace([[SHARED_ADDRSPACE]])*
+ // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
+ // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i8 addrspace([[SHARED_ADDRSPACE]])*
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load i8, i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+ // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i8, i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: store i8 [[MEDIUM_ELT_VAL]], i8* [[ELT_VOID]], align
// CHECK: br label {{%?}}[[READ_CONT:.+]]
//
@@ -534,12 +553,11 @@ int bar(int n){
// [[DO_COPY]]
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
+ // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
//
- // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to float addrspace([[SHARED_ADDRSPACE]])*
- // CHECK: [[ELT_VAL:%.+]] = load float, float* [[ELT]], align
- // CHECK: store float [[ELT_VAL]], float addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+ // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
+ // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
+ // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// CHECK: [[COPY_ELSE]]
@@ -556,13 +574,12 @@ int bar(int n){
// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
//
// CHECK: [[DO_READ]]
- // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to float addrspace([[SHARED_ADDRSPACE]])*
+ // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 1
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
- // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load float, float addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
- // CHECK: store float [[MEDIUM_ELT_VAL]], float* [[ELT]], align
+ // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+ // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+ // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align
// CHECK: br label {{%?}}[[READ_CONT:.+]]
//
// CHECK: [[READ_ELSE]]
@@ -689,13 +706,60 @@ int bar(int n){
// CHECK: [[REDUCE_CONT]]
// CHECK: ret
- // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l40}}_worker()
-
- // CHECK: define {{.*}}void [[T3:@__omp_offloading_.+template.+l40]](
+ // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l50}}(
+ //
+ // CHECK: call void @__kmpc_spmd_kernel_init(
+ // CHECK: call void @__kmpc_data_sharing_init_stack_spmd()
+ // CHECK: call void @__kmpc_get_team_static_memory(i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY:%.+]], %{{.+}} addrspace(3)* [[KERNEL_RD:@.+]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} {{8|16}}, i16 1, i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR:@.+]] to i8**))
+ // CHECK: [[PTR:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]],
+ // CHECK: [[GLOBAL_REC:%.+]] = bitcast i8* [[PTR]] to [[GLOB_REC_TY:%.+]]*
+ // CHECK-DAG: [[A_ADDR:%.+]] = getelementptr inbounds [[GLOB_REC_TY]], [[GLOB_REC_TY]]* [[GLOBAL_REC]], i32 0, i32 0
+ // CHECK-DAG: [[B_ADDR:%.+]] = getelementptr inbounds [[GLOB_REC_TY]], [[GLOB_REC_TY]]* [[GLOBAL_REC]], i32 0, i32 1
+ // CHECK: store i32 0, i32* [[A_ADDR]],
+ // CHECK: store i16 -32768, i16* [[B_ADDR]],
+ // CHECK: call void [[OUTLINED:@.+]](i32* {{.+}}, i32* {{.+}}, i32* [[A_ADDR]], i16* [[B_ADDR]])
+ // CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [2 x i8*]* [[RL:%.+]], i[[SZ]] 0, i[[SZ]] 0
+ // CHECK: [[A_CAST:%.+]] = bitcast i32* [[A_ADDR]] to i8*
+ // CHECK: store i8* [[A_CAST]], i8** [[PTR1]], align
+ // CHECK: [[PTR2:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RL]], i[[SZ]] 0, i[[SZ]] 1
+ // CHECK: [[B_CAST:%.+]] = bitcast i16* [[B_ADDR]] to i8*
+ // CHECK: store i8* [[B_CAST]], i8** [[PTR2]], align
+ // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
+ // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_teams_reduce_nowait(i32 {{.+}}, i32 2, i[[SZ]] {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]], void (i8*, i8*, i32, i32)* [[SCRATCH_COPY_FN:@.+]], void (i8*, i8*, i32, i32, i32)* [[LOAD_REDUCE_FN:@.+]])
+ // CHECK: [[COND:%.+]] = icmp eq i32 [[RET]], 1
+ // CHECK: br i1 [[COND]], label {{%?}}[[IFLABEL:.+]], label {{%?}}[[EXIT:.+]]
//
- // CHECK: {{call|invoke}} void [[T3]]_worker()
+ // CHECK: [[IFLABEL]]
+ // CHECK: [[A_INV:%.+]] = load i32, i32* [[A_IN:%.+]], align
+ // CHECK: [[AV:%.+]] = load i32, i32* [[A_ADDR]], align
+ // CHECK: [[OR:%.+]] = or i32 [[A_INV]], [[AV]]
+ // CHECK: store i32 [[OR]], i32* [[A_IN]], align
+ // CHECK: [[B_INV16:%.+]] = load i16, i16* [[B_IN:%.+]], align
+ // CHECK: [[B_INV:%.+]] = sext i16 [[B_INV16]] to i32
+ // CHECK: [[BV16:%.+]] = load i16, i16* [[B_ADDR]], align
+ // CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32
+ // CHECK: [[CMP:%.+]] = icmp sgt i32 [[B_INV]], [[BV]]
+ // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
//
- // CHECK: call void @__kmpc_kernel_init(
+ // CHECK: [[DO_MAX]]
+ // CHECK: [[MAX1:%.+]] = load i16, i16* [[B_IN]], align
+ // CHECK: br label {{%?}}[[MAX_CONT:.+]]
+ //
+ // CHECK: [[MAX_ELSE]]
+ // CHECK: [[MAX2:%.+]] = load i16, i16* [[B_ADDR]], align
+ // CHECK: br label {{%?}}[[MAX_CONT]]
+ //
+ // CHECK: [[MAX_CONT]]
+ // CHECK: [[B_MAX:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ]
+ // CHECK: store i16 [[B_MAX]], i16* [[B_IN]], align
+ // CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
+ // CHECK: br label %[[EXIT]]
+ //
+ // CHECK: [[EXIT]]
+ // CHECK: call void @__kmpc_restore_team_static_memory(i16 1)
+ // CHECK: call void @__kmpc_spmd_kernel_deinit(
+
+ // CHECK: define internal void [[OUTLINED]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i32* dereferenceable{{.+}}, i16* dereferenceable{{.+}})
//
// CHECK: store i32 0, i32* [[A:%.+]], align
// CHECK: store i16 -32768, i16* [[B:%.+]], align
@@ -726,7 +790,7 @@ int bar(int n){
// CHECK: [[B_CAST:%.+]] = bitcast i16* [[B]] to i8*
// CHECK: store i8* [[B_CAST]], i8** [[PTR2]], align
// CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
- // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_teams_reduce_nowait(i32 {{.+}}, i32 2, i[[SZ]] {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]], void (i8*, i8*, i32, i32)* [[SCRATCH_COPY_FN:@.+]], void (i8*, i8*, i32, i32, i32)* [[LOAD_REDUCE_FN:@.+]])
+ // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait(i32 {{.+}}, i32 2, i[[SZ]] {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[PAR_SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[PAR_WARP_COPY_FN:@.+]])
// CHECK: [[COND:%.+]] = icmp eq i32 [[RET]], 1
// CHECK: br i1 [[COND]], label {{%?}}[[IFLABEL:.+]], label {{%?}}[[EXIT:.+]]
//
@@ -757,7 +821,243 @@ int bar(int n){
// CHECK: br label %[[EXIT]]
//
// CHECK: [[EXIT]]
- // CHECK: call void @__kmpc_kernel_deinit(
+ // CHECK: ret void
+
+ //
+ // Reduction function
+ // CHECK: define internal void [[PAR_REDUCTION_FUNC:@.+]](i8*, i8*)
+ // CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i[[SZ]] 0, i[[SZ]] 0
+ // CHECK: [[VAR1_RHS_VOID:%.+]] = load i8*, i8** [[VAR1_RHS_REF]],
+ // CHECK: [[VAR1_RHS:%.+]] = bitcast i8* [[VAR1_RHS_VOID]] to i32*
+ //
+ // CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i[[SZ]] 0, i[[SZ]] 0
+ // CHECK: [[VAR1_LHS_VOID:%.+]] = load i8*, i8** [[VAR1_LHS_REF]],
+ // CHECK: [[VAR1_LHS:%.+]] = bitcast i8* [[VAR1_LHS_VOID]] to i32*
+ //
+ // CHECK: [[VAR2_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS]], i[[SZ]] 0, i[[SZ]] 1
+ // CHECK: [[VAR2_RHS_VOID:%.+]] = load i8*, i8** [[VAR2_RHS_REF]],
+ // CHECK: [[VAR2_RHS:%.+]] = bitcast i8* [[VAR2_RHS_VOID]] to i16*
+ //
+ // CHECK: [[VAR2_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS]], i[[SZ]] 0, i[[SZ]] 1
+ // CHECK: [[VAR2_LHS_VOID:%.+]] = load i8*, i8** [[VAR2_LHS_REF]],
+ // CHECK: [[VAR2_LHS:%.+]] = bitcast i8* [[VAR2_LHS_VOID]] to i16*
+ //
+ // CHECK: [[VAR1_LHS_VAL:%.+]] = load i32, i32* [[VAR1_LHS]],
+ // CHECK: [[VAR1_RHS_VAL:%.+]] = load i32, i32* [[VAR1_RHS]],
+ // CHECK: [[OR:%.+]] = or i32 [[VAR1_LHS_VAL]], [[VAR1_RHS_VAL]]
+ // CHECK: store i32 [[OR]], i32* [[VAR1_LHS]],
+ //
+ // CHECK: [[VAR2_LHS_VAL16:%.+]] = load i16, i16* [[VAR2_LHS]],
+ // CHECK: [[VAR2_LHS_VAL:%.+]] = sext i16 [[VAR2_LHS_VAL16]] to i32
+ // CHECK: [[VAR2_RHS_VAL16:%.+]] = load i16, i16* [[VAR2_RHS]],
+ // CHECK: [[VAR2_RHS_VAL:%.+]] = sext i16 [[VAR2_RHS_VAL16]] to i32
+ //
+ // CHECK: [[CMP:%.+]] = icmp sgt i32 [[VAR2_LHS_VAL]], [[VAR2_RHS_VAL]]
+ // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
+ //
+ // CHECK: [[DO_MAX]]
+ // CHECK: [[MAX1:%.+]] = load i16, i16* [[VAR2_LHS]], align
+ // CHECK: br label {{%?}}[[MAX_CONT:.+]]
+ //
+ // CHECK: [[MAX_ELSE]]
+ // CHECK: [[MAX2:%.+]] = load i16, i16* [[VAR2_RHS]], align
+ // CHECK: br label {{%?}}[[MAX_CONT]]
+ //
+ // CHECK: [[MAX_CONT]]
+ // CHECK: [[MAXV:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ]
+ // CHECK: store i16 [[MAXV]], i16* [[VAR2_LHS]],
+ // CHECK: ret void
+
+ //
+ // Shuffle and reduce function
+ // CHECK: define internal void [[PAR_SHUFFLE_REDUCE_FN]](i8*, i16 {{.*}}, i16 {{.*}}, i16 {{.*}})
+ // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align
+ // CHECK: [[REMOTE_ELT1:%.+]] = alloca i32
+ // CHECK: [[REMOTE_ELT2:%.+]] = alloca i16
+ //
+ // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align
+ // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align
+ // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align
+ //
+ // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
+ // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+ // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
+ // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+ // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
+ //
+ // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+ // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
+ // CHECK: [[REMOTE_ELT1_VAL:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]])
+ //
+ // CHECK: store i32 [[REMOTE_ELT1_VAL]], i32* [[REMOTE_ELT1]], align
+ // CHECK: [[REMOTE_ELT1C:%.+]] = bitcast i32* [[REMOTE_ELT1]] to i8*
+ // CHECK: store i8* [[REMOTE_ELT1C]], i8** [[REMOTE_ELT_REF]], align
+ //
+ // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
+ // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+ // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
+ // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
+ // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
+ //
+ // CHECK: [[ELT_CAST:%.+]] = sext i16 [[ELT_VAL]] to i32
+ // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+ // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
+ // CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]])
+ // CHECK: [[REMOTE_ELT2_VAL:%.+]] = trunc i32 [[REMOTE_ELT2_VAL32]] to i16
+ //
+ // CHECK: store i16 [[REMOTE_ELT2_VAL]], i16* [[REMOTE_ELT2]], align
+ // CHECK: [[REMOTE_ELT2C:%.+]] = bitcast i16* [[REMOTE_ELT2]] to i8*
+ // CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align
+ //
+ // Condition to reduce
+ // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0
+ //
+ // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
+ // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]]
+ // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]]
+ //
+ // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2
+ // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1
+ // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0
+ // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]]
+ // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0
+ // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]]
+ //
+ // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]]
+ // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]]
+ // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
+ //
+ // CHECK: [[DO_REDUCE]]
+ // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8*
+ // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8*
+ // CHECK: call void [[PAR_REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
+ // CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
+ //
+ // CHECK: [[REDUCE_ELSE]]
+ // CHECK: br label {{%?}}[[REDUCE_CONT]]
+ //
+ // CHECK: [[REDUCE_CONT]]
+ // Now check if we should just copy over the remote reduction list
+ // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
+ // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]]
+ // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]]
+ // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
+ //
+ // CHECK: [[DO_COPY]]
+ // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
+ // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
+ // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
+ // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+ // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i32*
+ // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+ // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i32, i32* [[REMOTE_ELT]], align
+ // CHECK: store i32 [[REMOTE_ELT_VAL]], i32* [[ELT]], align
+ //
+ // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
+ // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
+ // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
+ // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+ // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i16*
+ // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
+ // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i16, i16* [[REMOTE_ELT]], align
+ // CHECK: store i16 [[REMOTE_ELT_VAL]], i16* [[ELT]], align
+ // CHECK: br label {{%?}}[[COPY_CONT:.+]]
+ //
+ // CHECK: [[COPY_ELSE]]
+ // CHECK: br label {{%?}}[[COPY_CONT]]
+ //
+ // CHECK: [[COPY_CONT]]
+ // CHECK: void
+
+ //
+ // Inter warp copy function
+ // CHECK: define internal void [[PAR_WARP_COPY_FN]](i8*, i32)
+ // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
+ // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
+ // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
+ // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
+ // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
+ //
+ // [[DO_COPY]]
+ // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
+ // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+ // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+ //
+ // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
+ // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
+ // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+ // CHECK: br label {{%?}}[[COPY_CONT:.+]]
+ //
+ // CHECK: [[COPY_ELSE]]
+ // CHECK: br label {{%?}}[[COPY_CONT]]
+ //
+ // Barrier after copy to shared memory storage medium.
+ // CHECK: [[COPY_CONT]]
+ // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+ // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]]
+ // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
+ //
+ // Read into warp 0.
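The protocol these inter-warp copy CHECK lines encode (the [[DO_COPY]] block above and the warp-0 read that follows) is easier to see in source form: each warp's master lane publishes its 32-bit partial result into a per-warp slot of a volatile shared-memory transfer array, all participating threads synchronize, and warp 0 then gathers one value per active warp. A minimal CUDA-style sketch, assuming illustrative names (transfer_medium, num_active_warps) that do not appear in the patch, and using __syncthreads() where the IR emits the named @llvm.nvvm.barrier over the active threads:

__device__ void inter_warp_copy_i32(int *elt, int num_active_warps) {
  // One i32 slot per warp of the block; 'volatile' corresponds to the
  // volatile loads/stores the updated CHECK lines now require.
  __shared__ volatile int transfer_medium[32];
  const int tid = threadIdx.x;
  const int lane_id = tid & 31;  // position within the warp ([[LANEID]])
  const int warp_id = tid >> 5;  // warp index within the block ([[WARPID]])
  if (lane_id == 0)              // warp master publishes its partial value
    transfer_medium[warp_id] = *elt;
  __syncthreads();               // barrier after the copy to shared memory
  if (tid < num_active_warps)    // warp 0 reads one value per active warp
    *elt = transfer_medium[tid];
  __syncthreads();               // barrier after the read into warp 0
}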
+ // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
+ // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
+ //
+ // CHECK: [[DO_READ]]
+ // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
+ // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
+ // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+ // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+ // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+ // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align
+ // CHECK: br label {{%?}}[[READ_CONT:.+]]
+ //
+ // CHECK: [[READ_ELSE]]
+ // CHECK: br label {{%?}}[[READ_CONT]]
+ //
+ // CHECK: [[READ_CONT]]
+ // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
+ // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
+ // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
+ //
+ // [[DO_COPY]]
+ // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
+ // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+ // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
+ //
+ // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
+ // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])*
+ // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
+ // CHECK: store volatile i16 [[ELT_VAL]], i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+ // CHECK: br label {{%?}}[[COPY_CONT:.+]]
+ //
+ // CHECK: [[COPY_ELSE]]
+ // CHECK: br label {{%?}}[[COPY_CONT]]
+ //
+ // Barrier after copy to shared memory storage medium.
+ // CHECK: [[COPY_CONT]]
+ // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+ // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]]
+ // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
+ //
+ // Read into warp 0.
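The second round above moves the i16 element through the same [32 x i32] transfer medium: the warp's i32 slot is bitcast down to the element's width rather than widened to i64 as before, and (per the first hunk of this file) elements wider than 32 bits now travel as a counted loop of 32-bit chunks, one publish/barrier/read round per chunk. A hedged sketch of both cases, reusing the illustrative transfer_medium array from the sketch above; the chunk count and loop shape are inferred from the [[CNT]] checks, not spelled out in the patch:

// Sub-word element: reinterpret the warp's i32 slot at i16 width.
__device__ void publish_short(volatile int *transfer_medium, short *elt,
                              int warp_id) {
  volatile short *slot =
      reinterpret_cast<volatile short *>(&transfer_medium[warp_id]);
  *slot = *elt;  // mirrors 'store volatile i16 ... addrspace(3)' above
}

// Wide element (e.g. double): one 32-bit chunk per barrier round,
// the [[CNT]] loop checked at the top of this test.
__device__ void publish_wide(volatile int *transfer_medium, double *elt,
                             int lane_id, int warp_id) {
  int *chunks = reinterpret_cast<int *>(elt);  // a 64-bit value as two i32s
  for (int cnt = 0; cnt < 2; ++cnt) {
    if (lane_id == 0)
      transfer_medium[warp_id] = chunks[cnt];
    __syncthreads();
    // ...warp 0 reads the slot back here, as in the i32 case...
    __syncthreads();
  }
}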
+ // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
+ // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
+ //
+ // CHECK: [[DO_READ]]
+ // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
+ // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])*
+ // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 1
+ // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+ // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
+ // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i16, i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+ // CHECK: store i16 [[MEDIUM_ELT_VAL]], i16* [[ELT]], align
+ // CHECK: br label {{%?}}[[READ_CONT:.+]]
+ //
+ // CHECK: [[READ_ELSE]]
+ // CHECK: br label {{%?}}[[READ_CONT]]
+ //
+ // CHECK: [[READ_CONT]]
+ // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
+ // CHECK: ret
//
// Reduction function
@@ -919,10 +1219,9 @@ int bar(int n){
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
//
- // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i32 addrspace([[SHARED_ADDRSPACE]])*
+ // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
// CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
- // CHECK: store i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+ // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// CHECK: [[COPY_ELSE]]
@@ -939,12 +1238,11 @@ int bar(int n){
// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
//
// CHECK: [[DO_READ]]
- // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i32 addrspace([[SHARED_ADDRSPACE]])*
+ // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
- // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+ // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align
// CHECK: br label {{%?}}[[READ_CONT:.+]]
//
@@ -961,10 +1259,10 @@ int bar(int n){
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
//
- // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i16 addrspace([[SHARED_ADDRSPACE]])*
+ // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
+ // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])*
// CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
- // CHECK: store i16 [[ELT_VAL]], i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+ // CHECK: store volatile i16 [[ELT_VAL]], i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// CHECK: [[COPY_ELSE]]
@@ -981,12 +1279,12 @@ int bar(int n){
// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
//
// CHECK: [[DO_READ]]
- // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i16 addrspace([[SHARED_ADDRSPACE]])*
+ // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
+ // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])*
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 1
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
- // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load i16, i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+ // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i16, i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: store i16 [[MEDIUM_ELT_VAL]], i16* [[ELT]], align
// CHECK: br label {{%?}}[[READ_CONT:.+]]
//
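The condition block in the shuffle-and-reduce function above ([[CONDALG0]] through [[SHOULD_REDUCE]], and the [[SHOULD_COPY]] test after it) selects which lanes fold in the value obtained from __kmpc_shuffle_int32 for each algorithm version. Restated as plain predicates (a sketch; the i16 operand types follow the IR, the function names are illustrative):

__device__ bool should_reduce(unsigned short lane_id,
                              unsigned short lane_offset,
                              unsigned short alg_ver) {
  bool alg0 = alg_ver == 0;           // full-warp butterfly: every lane folds
  bool alg1 = alg_ver == 1 &&         // partial warp: only the lanes below
              lane_id < lane_offset;  // the offset fold the remote value
  bool alg2 = alg_ver == 2 &&         // dispersed lanes: even lanes fold
              (lane_id & 1) == 0 &&   // while a positive offset remains
              lane_offset > 0;
  return alg0 || alg1 || alg2;
}

// The companion test: in version 1 the upper lanes instead adopt the
// remote reduction list wholesale.
__device__ bool should_copy(unsigned short lane_id,
                            unsigned short lane_offset,
                            unsigned short alg_ver) {
  return alg_ver == 1 && lane_id >= lane_offset;
}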