diff options
-rw-r--r-- | polly/lib/CodeGen/PPCGCodeGeneration.cpp | 217 | ||||
-rw-r--r-- | polly/test/GPGPU/double-parallel-loop.ll | 44 | ||||
-rw-r--r-- | polly/test/GPGPU/host-control-flow.ll | 67 | ||||
-rw-r--r-- | polly/test/GPGPU/kernel-params-only-some-arrays.ll | 6 |
4 files changed, 306 insertions, 28 deletions
diff --git a/polly/lib/CodeGen/PPCGCodeGeneration.cpp b/polly/lib/CodeGen/PPCGCodeGeneration.cpp index 1621252ba10..8b4d2220297 100644 --- a/polly/lib/CodeGen/PPCGCodeGeneration.cpp +++ b/polly/lib/CodeGen/PPCGCodeGeneration.cpp @@ -18,6 +18,7 @@ #include "polly/LinkAllPasses.h" #include "polly/Options.h" #include "polly/ScopInfo.h" +#include "polly/Support/SCEVValidator.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/GlobalsModRef.h" @@ -61,17 +62,37 @@ static cl::opt<bool> DumpKernelIR("polly-acc-dump-kernel-ir", /// This function is a callback for to generate the ast expressions for each /// of the scheduled ScopStmts. static __isl_give isl_id_to_ast_expr *pollyBuildAstExprForStmt( - void *Stmt, isl_ast_build *Build, + void *StmtT, isl_ast_build *Build, isl_multi_pw_aff *(*FunctionIndex)(__isl_take isl_multi_pw_aff *MPA, isl_id *Id, void *User), void *UserIndex, isl_ast_expr *(*FunctionExpr)(isl_ast_expr *Expr, isl_id *Id, void *User), - void *User_expr) { + void *UserExpr) { - // TODO: Implement the AST expression generation. For now we just return a - // nullptr to ensure that we do not free uninitialized pointers. 
+ ScopStmt *Stmt = (ScopStmt *)StmtT; - return nullptr; + isl_ctx *Ctx; + + if (!Stmt || !Build) + return NULL; + + Ctx = isl_ast_build_get_ctx(Build); + isl_id_to_ast_expr *RefToExpr = isl_id_to_ast_expr_alloc(Ctx, 0); + + for (MemoryAccess *Acc : *Stmt) { + isl_map *AddrFunc = Acc->getAddressFunction(); + AddrFunc = isl_map_intersect_domain(AddrFunc, Stmt->getDomain()); + isl_id *RefId = Acc->getId(); + isl_pw_multi_aff *PMA = isl_pw_multi_aff_from_map(AddrFunc); + isl_multi_pw_aff *MPA = isl_multi_pw_aff_from_pw_multi_aff(PMA); + MPA = isl_multi_pw_aff_coalesce(MPA); + MPA = FunctionIndex(MPA, RefId, UserIndex); + isl_ast_expr *Access = isl_ast_build_access_from_multi_pw_aff(Build, MPA); + Access = FunctionExpr(Access, RefId, UserExpr); + RefToExpr = isl_id_to_ast_expr_set(RefToExpr, RefId, Access); + } + + return RefToExpr; } /// Generate code for a GPU specific isl AST. @@ -86,7 +107,9 @@ public: GPUNodeBuilder(PollyIRBuilder &Builder, ScopAnnotator &Annotator, Pass *P, const DataLayout &DL, LoopInfo &LI, ScalarEvolution &SE, DominatorTree &DT, Scop &S, gpu_prog *Prog) - : IslNodeBuilder(Builder, Annotator, P, DL, LI, SE, DT, S), Prog(Prog) {} + : IslNodeBuilder(Builder, Annotator, P, DL, LI, SE, DT, S), Prog(Prog) { + getExprBuilder().setIDToSAI(&IDToSAI); + } private: /// A module containing GPU code. @@ -108,6 +131,8 @@ private: /// By releasing this set all isl_ids will be freed. std::set<std::unique_ptr<isl_id, IslIdDeleter>> KernelIDs; + IslExprBuilder::IDToScopArrayInfoTy IDToSAI; + /// Create code for user-defined AST nodes. /// /// These AST nodes can be of type: @@ -121,6 +146,13 @@ private: /// @param UserStmt The ast node to generate code for. virtual void createUser(__isl_take isl_ast_node *UserStmt); + /// Find llvm::Values referenced in GPU kernel. + /// + /// @param Kernel The kernel to scan for llvm::Values + /// + /// @returns A set of values referenced by the kernel. 
+ SetVector<Value *> getReferencesInKernel(ppcg_kernel *Kernel); + /// Create GPU kernel. /// /// Code generate the kernel described by @p KernelStmt. @@ -135,7 +167,9 @@ private: /// start block of this newly created function. /// /// @param Kernel The kernel to generate code for. - void createKernelFunction(ppcg_kernel *Kernel); + /// @param SubtreeValues The set of llvm::Values referenced by this kernel. + void createKernelFunction(ppcg_kernel *Kernel, + SetVector<Value *> &SubtreeValues); /// Create the declaration of a kernel function. /// @@ -147,14 +181,23 @@ private: /// - Other LLVM Value references (TODO) /// /// @param Kernel The kernel to generate the function declaration for. + /// @param SubtreeValues The set of llvm::Values referenced by this kernel. + /// /// @returns The newly declared function. - Function *createKernelFunctionDecl(ppcg_kernel *Kernel); + Function *createKernelFunctionDecl(ppcg_kernel *Kernel, + SetVector<Value *> &SubtreeValues); /// Insert intrinsic functions to obtain thread and block ids. /// /// @param The kernel to generate the intrinsic functions for. void insertKernelIntrinsics(ppcg_kernel *Kernel); + /// Create code for a ScopStmt called in @p Expr. + /// + /// @param Expr The expression containing the call. + /// @param KernelStmt The kernel statement referenced in the call. + void createScopStmt(isl_ast_expr *Expr, ppcg_kernel_stmt *KernelStmt); + /// Create an in-kernel synchronization call. 
void createKernelSync(); @@ -201,8 +244,7 @@ void GPUNodeBuilder::createUser(__isl_take isl_ast_node *UserStmt) { switch (KernelStmt->type) { case ppcg_kernel_domain: - // TODO Create kernel user stmt - isl_ast_expr_free(Expr); + createScopStmt(Expr, KernelStmt); isl_ast_node_free(UserStmt); return; case ppcg_kernel_copy: @@ -222,30 +264,143 @@ void GPUNodeBuilder::createUser(__isl_take isl_ast_node *UserStmt) { return; } +void GPUNodeBuilder::createScopStmt(isl_ast_expr *Expr, + ppcg_kernel_stmt *KernelStmt) { + auto Stmt = (ScopStmt *)KernelStmt->u.d.stmt->stmt; + isl_id_to_ast_expr *Indexes = KernelStmt->u.d.ref2expr; + + LoopToScevMapT LTS; + LTS.insert(OutsideLoopIterations.begin(), OutsideLoopIterations.end()); + + createSubstitutions(Expr, Stmt, LTS); + + if (Stmt->isBlockStmt()) + BlockGen.copyStmt(*Stmt, LTS, Indexes); + else + assert(0 && "Region statement not supported\n"); +} + void GPUNodeBuilder::createKernelSync() { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); auto *Sync = Intrinsic::getDeclaration(M, Intrinsic::nvvm_barrier0); Builder.CreateCall(Sync, {}); } +/// Collect llvm::Values referenced from @p Node +/// +/// This function only applies to isl_ast_nodes that are user_nodes referring +/// to a ScopStmt. All other node types are ignored. +/// +/// @param Node The node to collect references for. +/// @param User A user pointer used as storage for the data that is collected. +/// +/// @returns isl_bool_true if data could be collected successfully. 
+isl_bool collectReferencesInGPUStmt(__isl_keep isl_ast_node *Node, void *User) { + if (isl_ast_node_get_type(Node) != isl_ast_node_user) + return isl_bool_true; + + isl_ast_expr *Expr = isl_ast_node_user_get_expr(Node); + isl_ast_expr *StmtExpr = isl_ast_expr_get_op_arg(Expr, 0); + isl_id *Id = isl_ast_expr_get_id(StmtExpr); + const char *Str = isl_id_get_name(Id); + isl_id_free(Id); + isl_ast_expr_free(StmtExpr); + isl_ast_expr_free(Expr); + + if (!isPrefix(Str, "Stmt")) + return isl_bool_true; + + Id = isl_ast_node_get_annotation(Node); + auto *KernelStmt = (ppcg_kernel_stmt *)isl_id_get_user(Id); + auto Stmt = (ScopStmt *)KernelStmt->u.d.stmt->stmt; + isl_id_free(Id); + + addReferencesFromStmt(Stmt, User); + + return isl_bool_true; +} + +SetVector<Value *> GPUNodeBuilder::getReferencesInKernel(ppcg_kernel *Kernel) { + SetVector<Value *> SubtreeValues; + SetVector<const SCEV *> SCEVs; + SetVector<const Loop *> Loops; + SubtreeReferences References = { + LI, SE, S, ValueMap, SubtreeValues, SCEVs, getBlockGenerator()}; + + for (const auto &I : IDToValue) + SubtreeValues.insert(I.second); + + isl_ast_node_foreach_descendant_top_down( + Kernel->tree, collectReferencesInGPUStmt, &References); + + for (const SCEV *Expr : SCEVs) + findValues(Expr, SE, SubtreeValues); + + for (auto &SAI : S.arrays()) + SubtreeValues.remove(SAI.second->getBasePtr()); + + isl_space *Space = S.getParamSpace(); + for (long i = 0; i < isl_space_dim(Space, isl_dim_param); i++) { + isl_id *Id = isl_space_get_dim_id(Space, isl_dim_param, i); + assert(IDToValue.count(Id)); + Value *Val = IDToValue[Id]; + SubtreeValues.remove(Val); + isl_id_free(Id); + } + isl_space_free(Space); + + for (long i = 0; i < isl_space_dim(Kernel->space, isl_dim_set); i++) { + isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i); + assert(IDToValue.count(Id)); + Value *Val = IDToValue[Id]; + SubtreeValues.remove(Val); + isl_id_free(Id); + } + + return SubtreeValues; +} + void 
GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) { isl_id *Id = isl_ast_node_get_annotation(KernelStmt); ppcg_kernel *Kernel = (ppcg_kernel *)isl_id_get_user(Id); isl_id_free(Id); isl_ast_node_free(KernelStmt); + SetVector<Value *> SubtreeValues = getReferencesInKernel(Kernel); + assert(Kernel->tree && "Device AST of kernel node is empty"); Instruction &HostInsertPoint = *Builder.GetInsertPoint(); IslExprBuilder::IDToValueTy HostIDs = IDToValue; + ValueMapT HostValueMap = ValueMap; + + SetVector<const Loop *> Loops; + + // Create for all loops we depend on values that contain the current loop + // iteration. These values are necessary to generate code for SCEVs that + // depend on such loops. As a result we need to pass them to the subfunction. + for (const Loop *L : Loops) { + const SCEV *OuterLIV = SE.getAddRecExpr(SE.getUnknown(Builder.getInt64(0)), + SE.getUnknown(Builder.getInt64(1)), + L, SCEV::FlagAnyWrap); + Value *V = generateSCEV(OuterLIV); + OutsideLoopIterations[L] = SE.getUnknown(V); + SubtreeValues.insert(V); + } - createKernelFunction(Kernel); + createKernelFunction(Kernel, SubtreeValues); create(isl_ast_node_copy(Kernel->tree)); Builder.SetInsertPoint(&HostInsertPoint); IDToValue = HostIDs; + ValueMap = HostValueMap; + ScalarMap.clear(); + PHIOpMap.clear(); + EscapeMap.clear(); + IDToSAI.clear(); + finalizeKernelFunction(); } @@ -263,7 +418,9 @@ static std::string computeNVPTXDataLayout(bool is64Bit) { return Ret; } -Function *GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel) { +Function * +GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel, + SetVector<Value *> &SubtreeValues) { std::vector<Type *> Args; std::string Identifier = "kernel_" + std::to_string(Kernel->id); @@ -284,6 +441,9 @@ Function *GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel) { for (long i = 0; i < NumVars; i++) Args.push_back(Builder.getInt64Ty()); + for (auto *V : SubtreeValues) + Args.push_back(V->getType()); + auto *FT = 
FunctionType::get(Builder.getVoidTy(), Args, false); auto *FN = Function::Create(FT, Function::ExternalLinkage, Identifier, GPUModule.get()); @@ -294,7 +454,27 @@ Function *GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel) { if (!ppcg_kernel_requires_array_argument(Kernel, i)) continue; - Arg->setName(Prog->array[i].name); + Arg->setName(Kernel->array[i].array->name); + + isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set); + const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl_id_copy(Id)); + Type *EleTy = SAI->getElementType(); + Value *Val = &*Arg; + SmallVector<const SCEV *, 4> Sizes; + isl_ast_build *Build = + isl_ast_build_from_context(isl_set_copy(Prog->context)); + for (long j = 1; j < Kernel->array[i].array->n_index; j++) { + isl_ast_expr *DimSize = isl_ast_build_expr_from_pw_aff( + Build, isl_pw_aff_copy(Kernel->array[i].array->bound[j])); + auto V = ExprBuilder.create(DimSize); + Sizes.push_back(SE.getSCEV(V)); + } + const ScopArrayInfo *SAIRep = + S.getOrCreateScopArrayInfo(Val, EleTy, Sizes, ScopArrayInfo::MK_Array); + + isl_ast_build_free(Build); + isl_id_free(Id); + IDToSAI[Id] = SAIRep; Arg++; } @@ -314,6 +494,12 @@ Function *GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel) { Arg++; } + for (auto *V : SubtreeValues) { + Arg->setName(V->getName()); + ValueMap[V] = &*Arg; + Arg++; + } + return FN; } @@ -346,14 +532,15 @@ void GPUNodeBuilder::insertKernelIntrinsics(ppcg_kernel *Kernel) { } } -void GPUNodeBuilder::createKernelFunction(ppcg_kernel *Kernel) { +void GPUNodeBuilder::createKernelFunction(ppcg_kernel *Kernel, + SetVector<Value *> &SubtreeValues) { std::string Identifier = "kernel_" + std::to_string(Kernel->id); GPUModule.reset(new Module(Identifier, Builder.getContext())); GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda")); GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */)); - Function *FN = createKernelFunctionDecl(Kernel); + Function *FN = 
createKernelFunctionDecl(Kernel, SubtreeValues); BasicBlock *PrevBlock = Builder.GetInsertBlock(); auto EntryBlock = BasicBlock::Create(Builder.getContext(), "entry", FN); diff --git a/polly/test/GPGPU/double-parallel-loop.ll b/polly/test/GPGPU/double-parallel-loop.ll index f9b3f1fa6c6..2eee8aaaa90 100644 --- a/polly/test/GPGPU/double-parallel-loop.ll +++ b/polly/test/GPGPU/double-parallel-loop.ll @@ -105,18 +105,52 @@ ; KERNEL-IR-NEXT: %t1 = zext i32 %3 to i64 ; KERNEL-IR-NEXT: br label %polly.loop_preheader -; KERNEL-IR-LABEL: polly.loop_exit: +; KERNEL-IR-LABEL: polly.loop_exit: ; preds = %polly.stmt.bb5 ; KERNEL-IR-NEXT: ret void -; KERNEL-IR-LABEL: polly.loop_header: -; KERNEL-IR-NEXT: %polly.indvar = phi i64 [ 0, %polly.loop_preheader ], [ %polly.indvar_next, %polly.loop_header ] +; KERNEL-IR-LABEL: polly.loop_header: ; preds = %polly.stmt.bb5, %polly.loop_preheader +; KERNEL-IR-NEXT: %polly.indvar = phi i64 [ 0, %polly.loop_preheader ], [ %polly.indvar_next, %polly.stmt.bb5 ] +; KERNEL-IR-NEXT: %4 = mul nsw i64 32, %b0 +; KERNEL-IR-NEXT: %5 = add nsw i64 %4, %t0 +; KERNEL-IR-NEXT: %6 = mul nsw i64 32, %b1 +; KERNEL-IR-NEXT: %7 = add nsw i64 %6, %t1 +; KERNEL-IR-NEXT: %8 = mul nsw i64 16, %polly.indvar +; KERNEL-IR-NEXT: %9 = add nsw i64 %7, %8 +; KERNEL-IR-NEXT: br label %polly.stmt.bb5 + +; KERNEL-IR-LABEL: polly.stmt.bb5: ; preds = %polly.loop_header +; KERNEL-IR-NEXT: %10 = mul i64 %9, %5 +; KERNEL-IR-NEXT: %p_tmp6 = sitofp i64 %10 to float +; KERNEL-IR-NEXT: %polly.access.cast.MemRef_A = bitcast i8* %MemRef_A to float* +; KERNEL-IR-NEXT: %11 = mul nsw i64 32, %b0 +; KERNEL-IR-NEXT: %12 = add nsw i64 %11, %t0 +; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A = mul nsw i64 %12, 1024 +; KERNEL-IR-NEXT: %13 = mul nsw i64 32, %b1 +; KERNEL-IR-NEXT: %14 = add nsw i64 %13, %t1 +; KERNEL-IR-NEXT: %15 = mul nsw i64 16, %polly.indvar +; KERNEL-IR-NEXT: %16 = add nsw i64 %14, %15 +; KERNEL-IR-NEXT: %polly.access.add.MemRef_A = add nsw i64 %polly.access.mul.MemRef_A, 
%16 +; KERNEL-IR-NEXT: %polly.access.MemRef_A = getelementptr float, float* %polly.access.cast.MemRef_A, i64 %polly.access.add.MemRef_A +; KERNEL-IR-NEXT: %tmp8_p_scalar_ = load float, float* %polly.access.MemRef_A, align 4 +; KERNEL-IR-NEXT: %p_tmp9 = fadd float %tmp8_p_scalar_, %p_tmp6 +; KERNEL-IR-NEXT: %polly.access.cast.MemRef_A1 = bitcast i8* %MemRef_A to float* +; KERNEL-IR-NEXT: %17 = mul nsw i64 32, %b0 +; KERNEL-IR-NEXT: %18 = add nsw i64 %17, %t0 +; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A2 = mul nsw i64 %18, 1024 +; KERNEL-IR-NEXT: %19 = mul nsw i64 32, %b1 +; KERNEL-IR-NEXT: %20 = add nsw i64 %19, %t1 +; KERNEL-IR-NEXT: %21 = mul nsw i64 16, %polly.indvar +; KERNEL-IR-NEXT: %22 = add nsw i64 %20, %21 +; KERNEL-IR-NEXT: %polly.access.add.MemRef_A3 = add nsw i64 %polly.access.mul.MemRef_A2, %22 +; KERNEL-IR-NEXT: %polly.access.MemRef_A4 = getelementptr float, float* %polly.access.cast.MemRef_A1, i64 %polly.access.add.MemRef_A3 +; KERNEL-IR-NEXT: store float %p_tmp9, float* %polly.access.MemRef_A4, align 4 ; KERNEL-IR-NEXT: %polly.indvar_next = add nsw i64 %polly.indvar, 1 ; KERNEL-IR-NEXT: %polly.loop_cond = icmp sle i64 %polly.indvar, 0 ; KERNEL-IR-NEXT: br i1 %polly.loop_cond, label %polly.loop_header, label %polly.loop_exit -; KERNEL-IR-LABEL: polly.loop_preheader: +; KERNEL-IR-LABEL: polly.loop_preheader: ; preds = %entry ; KERNEL-IR-NEXT: br label %polly.loop_header -; KERNEL-IR-NEXT: } + ; void double_parallel_loop(float A[][1024]) { ; for (long i = 0; i < 1024; i++) diff --git a/polly/test/GPGPU/host-control-flow.ll b/polly/test/GPGPU/host-control-flow.ll index d3ec4bbce25..911d5cbc896 100644 --- a/polly/test/GPGPU/host-control-flow.ll +++ b/polly/test/GPGPU/host-control-flow.ll @@ -34,27 +34,82 @@ ; IR-NEXT: %polly.loop_cond = icmp sle i64 %polly.indvar, 98 ; IR-NEXT: br i1 %polly.loop_cond, label %polly.loop_header, label %polly.loop_exit -; KERNEL-IR-LABEL: define ptx_kernel void @kernel_0(i8* %MemRef_A, i64 %c0) { -; KERNEL-IR-NEXT: entry: 
+; KERNEL-IR: define ptx_kernel void @kernel_0(i8* %MemRef_A, i64 %c0) { +; KERNEL-IR-LABEL: entry: ; KERNEL-IR-NEXT: %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() ; KERNEL-IR-NEXT: %b0 = zext i32 %0 to i64 ; KERNEL-IR-NEXT: %1 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() ; KERNEL-IR-NEXT: %t0 = zext i32 %1 to i64 ; KERNEL-IR-NEXT: br label %polly.cond -; KERNEL-IR-LABEL: polly.cond: +; KERNEL-IR-LABEL: polly.cond: ; preds = %entry ; KERNEL-IR-NEXT: %2 = mul nsw i64 32, %b0 ; KERNEL-IR-NEXT: %3 = add nsw i64 %2, %t0 ; KERNEL-IR-NEXT: %4 = icmp sle i64 %3, 97 ; KERNEL-IR-NEXT: br i1 %4, label %polly.then, label %polly.else -; KERNEL-IR-LABEL: polly.merge: +; KERNEL-IR-LABEL: polly.merge: ; preds = %polly.else, %polly.stmt.for.body3 ; KERNEL-IR-NEXT: ret void -; KERNEL-IR-LABEL: polly.then: +; KERNEL-IR-LABEL: polly.then: ; preds = %polly.cond +; KERNEL-IR-NEXT: %5 = mul nsw i64 32, %b0 +; KERNEL-IR-NEXT: %6 = add nsw i64 %5, %t0 +; KERNEL-IR-NEXT: br label %polly.stmt.for.body3 + +; KERNEL-IR-LABEL: polly.stmt.for.body3: ; preds = %polly.then +; KERNEL-IR-NEXT: %polly.access.cast.MemRef_A = bitcast i8* %MemRef_A to float* +; KERNEL-IR-NEXT: %pexp.pdiv_r = urem i64 %c0, 2 +; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A = mul nsw i64 %pexp.pdiv_r, 100 +; KERNEL-IR-NEXT: %7 = mul nsw i64 32, %b0 +; KERNEL-IR-NEXT: %8 = add nsw i64 %7, %t0 +; KERNEL-IR-NEXT: %polly.access.add.MemRef_A = add nsw i64 %polly.access.mul.MemRef_A, %8 +; KERNEL-IR-NEXT: %polly.access.MemRef_A = getelementptr float, float* %polly.access.cast.MemRef_A, i64 %polly.access.add.MemRef_A +; KERNEL-IR-NEXT: %tmp_p_scalar_ = load float, float* %polly.access.MemRef_A, align 4 +; KERNEL-IR-NEXT: %9 = add i64 %6, 1 +; KERNEL-IR-NEXT: %polly.access.cast.MemRef_A1 = bitcast i8* %MemRef_A to float* +; KERNEL-IR-NEXT: %pexp.pdiv_r2 = urem i64 %c0, 2 +; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A3 = mul nsw i64 %pexp.pdiv_r2, 100 +; KERNEL-IR-NEXT: %10 = mul nsw i64 32, %b0 +; KERNEL-IR-NEXT: %11 = add nsw i64 
%10, %t0 +; KERNEL-IR-NEXT: %12 = add nsw i64 %11, 1 +; KERNEL-IR-NEXT: %polly.access.add.MemRef_A4 = add nsw i64 %polly.access.mul.MemRef_A3, %12 +; KERNEL-IR-NEXT: %polly.access.MemRef_A5 = getelementptr float, float* %polly.access.cast.MemRef_A1, i64 %polly.access.add.MemRef_A4 +; KERNEL-IR-NEXT: %tmp2_p_scalar_ = load float, float* %polly.access.MemRef_A5, align 4 +; KERNEL-IR-NEXT: %p_add = fadd float %tmp_p_scalar_, %tmp2_p_scalar_ +; KERNEL-IR-NEXT: %polly.access.cast.MemRef_A6 = bitcast i8* %MemRef_A to float* +; KERNEL-IR-NEXT: %pexp.pdiv_r7 = urem i64 %c0, 2 +; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A8 = mul nsw i64 %pexp.pdiv_r7, 100 +; KERNEL-IR-NEXT: %13 = mul nsw i64 32, %b0 +; KERNEL-IR-NEXT: %14 = add nsw i64 %13, %t0 +; KERNEL-IR-NEXT: %15 = add nsw i64 %14, 2 +; KERNEL-IR-NEXT: %polly.access.add.MemRef_A9 = add nsw i64 %polly.access.mul.MemRef_A8, %15 +; KERNEL-IR-NEXT: %polly.access.MemRef_A10 = getelementptr float, float* %polly.access.cast.MemRef_A6, i64 %polly.access.add.MemRef_A9 +; KERNEL-IR-NEXT: %tmp3_p_scalar_ = load float, float* %polly.access.MemRef_A10, align 4 +; KERNEL-IR-NEXT: %p_add12 = fadd float %p_add, %tmp3_p_scalar_ +; KERNEL-IR-NEXT: %polly.access.cast.MemRef_A11 = bitcast i8* %MemRef_A to float* +; KERNEL-IR-NEXT: %16 = add nsw i64 %c0, 1 +; KERNEL-IR-NEXT: %pexp.pdiv_r12 = urem i64 %16, 2 +; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A13 = mul nsw i64 %pexp.pdiv_r12, 100 +; KERNEL-IR-NEXT: %17 = mul nsw i64 32, %b0 +; KERNEL-IR-NEXT: %18 = add nsw i64 %17, %t0 +; KERNEL-IR-NEXT: %19 = add nsw i64 %18, 1 +; KERNEL-IR-NEXT: %polly.access.add.MemRef_A14 = add nsw i64 %polly.access.mul.MemRef_A13, %19 +; KERNEL-IR-NEXT: %polly.access.MemRef_A15 = getelementptr float, float* %polly.access.cast.MemRef_A11, i64 %polly.access.add.MemRef_A14 +; KERNEL-IR-NEXT: %tmp4_p_scalar_ = load float, float* %polly.access.MemRef_A15, align 4 +; KERNEL-IR-NEXT: %p_add17 = fadd float %tmp4_p_scalar_, %p_add12 +; KERNEL-IR-NEXT: 
%polly.access.cast.MemRef_A16 = bitcast i8* %MemRef_A to float* +; KERNEL-IR-NEXT: %20 = add nsw i64 %c0, 1 +; KERNEL-IR-NEXT: %pexp.pdiv_r17 = urem i64 %20, 2 +; KERNEL-IR-NEXT: %polly.access.mul.MemRef_A18 = mul nsw i64 %pexp.pdiv_r17, 100 +; KERNEL-IR-NEXT: %21 = mul nsw i64 32, %b0 +; KERNEL-IR-NEXT: %22 = add nsw i64 %21, %t0 +; KERNEL-IR-NEXT: %23 = add nsw i64 %22, 1 +; KERNEL-IR-NEXT: %polly.access.add.MemRef_A19 = add nsw i64 %polly.access.mul.MemRef_A18, %23 +; KERNEL-IR-NEXT: %polly.access.MemRef_A20 = getelementptr float, float* %polly.access.cast.MemRef_A16, i64 %polly.access.add.MemRef_A19 +; KERNEL-IR-NEXT: store float %p_add17, float* %polly.access.MemRef_A20, align 4 ; KERNEL-IR-NEXT: br label %polly.merge -; KERNEL-IR-LABEL: polly.else: +; KERNEL-IR-LABEL: polly.else: ; preds = %polly.cond ; KERNEL-IR-NEXT: br label %polly.merge ; KERNEL-IR-NEXT: } diff --git a/polly/test/GPGPU/kernel-params-only-some-arrays.ll b/polly/test/GPGPU/kernel-params-only-some-arrays.ll index b7eafb71124..0d36c54dd8b 100644 --- a/polly/test/GPGPU/kernel-params-only-some-arrays.ll +++ b/polly/test/GPGPU/kernel-params-only-some-arrays.ll @@ -23,7 +23,8 @@ ; KERNEL-NEXT: %b0 = zext i32 %0 to i64 ; KERNEL-NEXT: %1 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() ; KERNEL-NEXT: %t0 = zext i32 %1 to i64 -; KERNEL-NEXT: ret void + +; KERNEL: ret void ; KERNEL-NEXT: } ; KERNEL: ; ModuleID = 'kernel_1' @@ -37,7 +38,8 @@ ; KERNEL-NEXT: %b0 = zext i32 %0 to i64 ; KERNEL-NEXT: %1 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() ; KERNEL-NEXT: %t0 = zext i32 %1 to i64 -; KERNEL-NEXT: ret void + +; KERNEL: ret void ; KERNEL-NEXT: } target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" |