diff options
-rw-r--r-- | polly/lib/CodeGen/PPCGCodeGeneration.cpp | 123 | ||||
-rw-r--r-- | polly/test/GPGPU/double-parallel-loop.ll | 14 | ||||
-rw-r--r-- | polly/test/GPGPU/scalar-parameter-fp128.ll | 39 | ||||
-rw-r--r-- | polly/test/GPGPU/scalar-parameter-i120.ll | 39 | ||||
-rw-r--r-- | polly/test/GPGPU/scalar-parameter-i128.ll | 39 | ||||
-rw-r--r-- | polly/test/GPGPU/scalar-parameter-i3000.ll | 39 | ||||
-rw-r--r-- | polly/test/GPGPU/scalar-parameter-i80.ll | 40 | ||||
-rw-r--r-- | polly/test/GPGPU/scalar-parameter-ppc_fp128.ll | 38 | ||||
-rw-r--r-- | polly/test/GPGPU/scalar-parameter-x86_fp80.ll | 39 | ||||
-rw-r--r-- | polly/test/GPGPU/scalar-parameter.ll | 346 |
10 files changed, 410 insertions, 346 deletions
diff --git a/polly/lib/CodeGen/PPCGCodeGeneration.cpp b/polly/lib/CodeGen/PPCGCodeGeneration.cpp index 8b4d2220297..74dd31d91fc 100644 --- a/polly/lib/CodeGen/PPCGCodeGeneration.cpp +++ b/polly/lib/CodeGen/PPCGCodeGeneration.cpp @@ -19,11 +19,18 @@ #include "polly/Options.h" #include "polly/ScopInfo.h" #include "polly/Support/SCEVValidator.h" +#include "llvm/ADT/PostOrderIterator.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/PostDominators.h" #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/TargetSelect.h" +#include "llvm/Target/TargetMachine.h" #include "isl/union_map.h" @@ -57,6 +64,21 @@ static cl::opt<bool> DumpKernelIR("polly-acc-dump-kernel-ir", cl::Hidden, cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory)); +static cl::opt<bool> DumpKernelASM("polly-acc-dump-kernel-asm", + cl::desc("Dump the kernel assembly code"), + cl::Hidden, cl::init(false), cl::ZeroOrMore, + cl::cat(PollyCategory)); + +static cl::opt<bool> FastMath("polly-acc-fastmath", + cl::desc("Allow unsafe math optimizations"), + cl::Hidden, cl::init(false), cl::ZeroOrMore, + cl::cat(PollyCategory)); + +static cl::opt<std::string> + CudaVersion("polly-acc-cuda-version", + cl::desc("The CUDA version to compile for"), cl::Hidden, + cl::init("sm_30"), cl::ZeroOrMore, cl::cat(PollyCategory)); + /// Create the ast expressions for a ScopStmt. /// /// This function is a callback for to generate the ast expressions for each @@ -112,6 +134,12 @@ public: } private: + /// A vector of array base pointers for which a new ScopArrayInfo was created. + /// + /// This vector is used to delete the ScopArrayInfo when it is not needed any + /// more. + std::vector<Value *> LocalArrays; + /// A module containing GPU code. /// /// This pointer is only set in case we are currently generating GPU code. @@ -201,6 +229,26 @@ private: /// Create an in-kernel synchronization call. void createKernelSync(); + /// Create a PTX assembly string for the current GPU kernel. + /// + /// @returns A string containing the corresponding PTX assembly code. + std::string createKernelASM(); + + /// Remove references from the dominator tree to the kernel function @p F. + /// + /// @param F The function to remove references to. + void clearDominators(Function *F); + + /// Remove references from scalar evolution to the kernel function @p F. + /// + /// @param F The function to remove references to. + void clearScalarEvolution(Function *F); + + /// Remove references from loop info to the kernel function @p F. + /// + /// @param F The function to remove references to. + void clearLoops(Function *F); + /// Finalize the generation of the kernel function. /// /// Free the LLVM-IR module corresponding to the kernel and -- if requested -- @@ -360,6 +408,33 @@ SetVector<Value *> GPUNodeBuilder::getReferencesInKernel(ppcg_kernel *Kernel) { return SubtreeValues; } +void GPUNodeBuilder::clearDominators(Function *F) { + DomTreeNode *N = DT.getNode(&F->getEntryBlock()); + std::vector<BasicBlock *> Nodes; + for (po_iterator<DomTreeNode *> I = po_begin(N), E = po_end(N); I != E; ++I) + Nodes.push_back(I->getBlock()); + + for (BasicBlock *BB : Nodes) + DT.eraseNode(BB); +} + +void GPUNodeBuilder::clearScalarEvolution(Function *F) { + for (BasicBlock &BB : *F) { + Loop *L = LI.getLoopFor(&BB); + if (L) + SE.forgetLoop(L); + } +} + +void GPUNodeBuilder::clearLoops(Function *F) { + for (BasicBlock &BB : *F) { + Loop *L = LI.getLoopFor(&BB); + if (L) + SE.forgetLoop(L); + LI.removeBlock(&BB); + } +} + void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) { isl_id *Id = isl_ast_node_get_annotation(KernelStmt); ppcg_kernel *Kernel = (ppcg_kernel *)isl_id_get_user(Id); @@ -392,6 +467,11 @@ void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) { create(isl_ast_node_copy(Kernel->tree)); + Function *F = Builder.GetInsertBlock()->getParent(); + clearDominators(F); + clearScalarEvolution(F); + clearLoops(F); + Builder.SetInsertPoint(&HostInsertPoint); IDToValue = HostIDs; @@ -400,6 +480,10 @@ void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) { PHIOpMap.clear(); EscapeMap.clear(); IDToSAI.clear(); + Annotator.resetAlternativeAliasBases(); + for (auto &BasePtr : LocalArrays) + S.invalidateScopArrayInfo(BasePtr, ScopArrayInfo::MK_Array); + LocalArrays.clear(); finalizeKernelFunction(); } @@ -471,6 +555,7 @@ GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel, } const ScopArrayInfo *SAIRep = S.getOrCreateScopArrayInfo(Val, EleTy, Sizes, ScopArrayInfo::MK_Array); + LocalArrays.push_back(Val); isl_ast_build_free(Build); isl_id_free(Id); @@ -555,11 +640,49 @@ void GPUNodeBuilder::createKernelFunction(ppcg_kernel *Kernel, insertKernelIntrinsics(Kernel); } +std::string GPUNodeBuilder::createKernelASM() { + llvm::Triple GPUTriple(Triple::normalize("nvptx64-nvidia-cuda")); + std::string ErrMsg; + auto GPUTarget = TargetRegistry::lookupTarget(GPUTriple.getTriple(), ErrMsg); + + if (!GPUTarget) { + errs() << ErrMsg << "\n"; + return ""; + } + + TargetOptions Options; + Options.UnsafeFPMath = FastMath; + std::unique_ptr<TargetMachine> TargetM( + GPUTarget->createTargetMachine(GPUTriple.getTriple(), CudaVersion, "", + Options, Optional<Reloc::Model>())); + + SmallString<0> ASMString; + raw_svector_ostream ASMStream(ASMString); + llvm::legacy::PassManager PM; + + PM.add(createTargetTransformInfoWrapperPass(TargetM->getTargetIRAnalysis())); + + if (TargetM->addPassesToEmitFile( + PM, ASMStream, TargetMachine::CGFT_AssemblyFile, true /* verify */)) { + errs() << "The target does not support generation of this file type!\n"; + return ""; + } + + PM.run(*GPUModule); + + return ASMStream.str(); +} + void GPUNodeBuilder::finalizeKernelFunction() { if (DumpKernelIR) outs() << *GPUModule << "\n"; + std::string Assembly = createKernelASM(); + + if (DumpKernelASM) + outs() << Assembly << "\n"; + GPUModule.release(); KernelIDs.clear(); } diff --git a/polly/test/GPGPU/double-parallel-loop.ll b/polly/test/GPGPU/double-parallel-loop.ll index 2eee8aaaa90..0cea456e5a4 100644 --- a/polly/test/GPGPU/double-parallel-loop.ll +++ b/polly/test/GPGPU/double-parallel-loop.ll @@ -14,6 +14,10 @@ ; RUN: -disable-output < %s | \ ; RUN: FileCheck %s -check-prefix=KERNEL-IR +; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-asm \ +; RUN: -disable-output < %s | \ +; RUN: FileCheck %s -check-prefix=KERNEL-ASM + ; REQUIRES: pollyacc ; CHECK: Stmt_bb5 @@ -152,6 +156,16 @@ ; KERNEL-IR-NEXT: br label %polly.loop_header +; KERNEL-ASM: .version 3.2 +; KERNEL-ASM-NEXT: .target sm_30 +; KERNEL-ASM-NEXT: .address_size 64 + +; KERNEL-ASM: // .globl kernel_0 + +; KERNEL-ASM: .visible .entry kernel_0( +; KERNEL-ASM-NEXT: .param .u64 kernel_0_param_0 +; KERNEL-ASM-NEXT: ) + ; void double_parallel_loop(float A[][1024]) { ; for (long i = 0; i < 1024; i++) ; for (long j = 0; j < 1024; j++) diff --git a/polly/test/GPGPU/scalar-parameter-fp128.ll b/polly/test/GPGPU/scalar-parameter-fp128.ll new file mode 100644 index 00000000000..568ac9c86cb --- /dev/null +++ b/polly/test/GPGPU/scalar-parameter-fp128.ll @@ -0,0 +1,39 @@ +; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code %s + +; XFAIL: * + +; REQUIRES: pollyacc + +; This fails today with "type mismatch between callee prototype and arguments" + +; void foo(fp128 A[], fp128 b) { +; for (long i = 0; i < 1024; i++) +; A[i] += b; +; } +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @fp128(fp128* %A, fp128 %b) { +bb: + br label %bb1 + +bb1: ; preds = %bb5, %bb + %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ] + %exitcond = icmp ne i64 %i.0, 1024 + br i1 %exitcond, label %bb2, label %bb7 + +bb2: ; preds = %bb1 + %tmp = getelementptr inbounds fp128, fp128* %A, i64 %i.0 + %tmp3 = load fp128, fp128* %tmp, align 4 + %tmp4 = fadd fp128 %tmp3, %b + store fp128 %tmp4, fp128* %tmp, align 4 + br label %bb5 + +bb5: ; preds = %bb2 + %tmp6 = add nuw nsw i64 %i.0, 1 + br label %bb1 + +bb7: ; preds = %bb1 + ret void +} + diff --git a/polly/test/GPGPU/scalar-parameter-i120.ll b/polly/test/GPGPU/scalar-parameter-i120.ll new file mode 100644 index 00000000000..e87b0cbb881 --- /dev/null +++ b/polly/test/GPGPU/scalar-parameter-i120.ll @@ -0,0 +1,39 @@ +; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code %s + +; XFAIL: * + +; REQUIRES: pollyacc + +; This fails today with "type mismatch between callee prototype and arguments" + +; void foo(i120 A[], i120 b) { +; for (long i = 0; i < 1024; i++) +; A[i] += b; +; } +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @i120(i120* %A, i120 %b) { +bb: + br label %bb1 + +bb1: ; preds = %bb5, %bb + %i.0 = phi i120 [ 0, %bb ], [ %tmp6, %bb5 ] + %exitcond = icmp ne i120 %i.0, 1024 + br i1 %exitcond, label %bb2, label %bb7 + +bb2: ; preds = %bb1 + %tmp = getelementptr inbounds i120, i120* %A, i120 %i.0 + %tmp3 = load i120, i120* %tmp, align 4 + %tmp4 = add i120 %tmp3, %b + store i120 %tmp4, i120* %tmp, align 4 + br label %bb5 + +bb5: ; preds = %bb2 + %tmp6 = add nuw nsw i120 %i.0, 1 + br label %bb1 + +bb7: ; preds = %bb1 + ret void +} + diff --git a/polly/test/GPGPU/scalar-parameter-i128.ll b/polly/test/GPGPU/scalar-parameter-i128.ll new file mode 100644 index 00000000000..c852a0cb11a --- /dev/null +++ b/polly/test/GPGPU/scalar-parameter-i128.ll @@ -0,0 +1,39 @@ +; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code %s + +; XFAIL: * + +; REQUIRES: pollyacc + +; This fails today with "LowerFormalArguments didn't emit the correct number of +; values!" + +; void foo(i128 A[], i128 b) { +; for (long i = 0; i < 1024; i++) +; A[i] += b; +; } +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @i128(i128* %A, i128 %b) { +bb: + br label %bb1 + +bb1: ; preds = %bb5, %bb + %i.0 = phi i128 [ 0, %bb ], [ %tmp6, %bb5 ] + %exitcond = icmp ne i128 %i.0, 1024 + br i1 %exitcond, label %bb2, label %bb7 + +bb2: ; preds = %bb1 + %tmp = getelementptr inbounds i128, i128* %A, i128 %i.0 + %tmp3 = load i128, i128* %tmp, align 4 + %tmp4 = add i128 %tmp3, %b + store i128 %tmp4, i128* %tmp, align 4 + br label %bb5 + +bb5: ; preds = %bb2 + %tmp6 = add nuw nsw i128 %i.0, 1 + br label %bb1 + +bb7: ; preds = %bb1 + ret void +} diff --git a/polly/test/GPGPU/scalar-parameter-i3000.ll b/polly/test/GPGPU/scalar-parameter-i3000.ll new file mode 100644 index 00000000000..ad7d17e4990 --- /dev/null +++ b/polly/test/GPGPU/scalar-parameter-i3000.ll @@ -0,0 +1,39 @@ +; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code %s + +; XFAIL: * + +; REQUIRES: pollyacc + +; This fails today with "LowerFormalArguments didn't emit the correct number of +; values!" + +; void foo(i3000 A[], i3000 b) { +; for (long i = 0; i < 1024; i++) +; A[i] += b; +; } +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @i3000(i3000* %A, i3000 %b) { +bb: + br label %bb1 + +bb1: ; preds = %bb5, %bb + %i.0 = phi i3000 [ 0, %bb ], [ %tmp6, %bb5 ] + %exitcond = icmp ne i3000 %i.0, 1024 + br i1 %exitcond, label %bb2, label %bb7 + +bb2: ; preds = %bb1 + %tmp = getelementptr inbounds i3000, i3000* %A, i3000 %i.0 + %tmp3 = load i3000, i3000* %tmp, align 4 + %tmp4 = add i3000 %tmp3, %b + store i3000 %tmp4, i3000* %tmp, align 4 + br label %bb5 + +bb5: ; preds = %bb2 + %tmp6 = add nuw nsw i3000 %i.0, 1 + br label %bb1 + +bb7: ; preds = %bb1 + ret void +} diff --git a/polly/test/GPGPU/scalar-parameter-i80.ll b/polly/test/GPGPU/scalar-parameter-i80.ll new file mode 100644 index 00000000000..5662d736462 --- /dev/null +++ b/polly/test/GPGPU/scalar-parameter-i80.ll @@ -0,0 +1,40 @@ +; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code %s + +; XFAIL: * + +; REQUIRES: pollyacc + +; This fails today with "LowerFormalArguments didn't emit the correct number of +; values!" + +; void foo(i80 A[], i80 b) { +; for (long i = 0; i < 1024; i++) +; A[i] += b; +; } +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @i80(i80* %A, i80 %b) { +bb: + br label %bb1 + +bb1: ; preds = %bb5, %bb + %i.0 = phi i80 [ 0, %bb ], [ %tmp6, %bb5 ] + %exitcond = icmp ne i80 %i.0, 1024 + br i1 %exitcond, label %bb2, label %bb7 + +bb2: ; preds = %bb1 + %tmp = getelementptr inbounds i80, i80* %A, i80 %i.0 + %tmp3 = load i80, i80* %tmp, align 4 + %tmp4 = add i80 %tmp3, %b + store i80 %tmp4, i80* %tmp, align 4 + br label %bb5 + +bb5: ; preds = %bb2 + %tmp6 = add nuw nsw i80 %i.0, 1 + br label %bb1 + +bb7: ; preds = %bb1 + ret void +} + diff --git a/polly/test/GPGPU/scalar-parameter-ppc_fp128.ll b/polly/test/GPGPU/scalar-parameter-ppc_fp128.ll new file mode 100644 index 00000000000..fa78c5faccc --- /dev/null +++ b/polly/test/GPGPU/scalar-parameter-ppc_fp128.ll @@ -0,0 +1,38 @@ +; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code %s + +; XFAIL: * + +; REQUIRES: pollyacc + +; This fails today with "type mismatch between callee prototype and arguments" + +; void foo(fp128 A[], fp128 b) { +; for (long i = 0; i < 1024; i++) +; A[i] += b; +; } +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @ppc_fp128(ppc_fp128* %A, ppc_fp128 %b) { +bb: + br label %bb1 + +bb1: ; preds = %bb5, %bb + %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ] + %exitcond = icmp ne i64 %i.0, 1024 + br i1 %exitcond, label %bb2, label %bb7 + +bb2: ; preds = %bb1 + %tmp = getelementptr inbounds ppc_fp128, ppc_fp128* %A, i64 %i.0 + %tmp3 = load ppc_fp128, ppc_fp128* %tmp, align 4 + %tmp4 = fadd ppc_fp128 %tmp3, %b + store ppc_fp128 %tmp4, ppc_fp128* %tmp, align 4 + br label %bb5 + +bb5: ; preds = %bb2 + %tmp6 = add nuw nsw i64 %i.0, 1 + br label %bb1 + +bb7: ; preds = %bb1 + ret void +} diff --git a/polly/test/GPGPU/scalar-parameter-x86_fp80.ll b/polly/test/GPGPU/scalar-parameter-x86_fp80.ll new file mode 100644 index 00000000000..568ac9c86cb --- /dev/null +++ b/polly/test/GPGPU/scalar-parameter-x86_fp80.ll @@ -0,0 +1,39 @@ +; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code %s + +; XFAIL: * + +; REQUIRES: pollyacc + +; This fails today with "type mismatch between callee prototype and arguments" + +; void foo(fp128 A[], fp128 b) { +; for (long i = 0; i < 1024; i++) +; A[i] += b; +; } +; +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define void @fp128(fp128* %A, fp128 %b) { +bb: + br label %bb1 + +bb1: ; preds = %bb5, %bb + %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ] + %exitcond = icmp ne i64 %i.0, 1024 + br i1 %exitcond, label %bb2, label %bb7 + +bb2: ; preds = %bb1 + %tmp = getelementptr inbounds fp128, fp128* %A, i64 %i.0 + %tmp3 = load fp128, fp128* %tmp, align 4 + %tmp4 = fadd fp128 %tmp3, %b + store fp128 %tmp4, fp128* %tmp, align 4 + br label %bb5 + +bb5: ; preds = %bb2 + %tmp6 = add nuw nsw i64 %i.0, 1 + br label %bb1 + +bb7: ; preds = %bb1 + ret void +} + diff --git a/polly/test/GPGPU/scalar-parameter.ll b/polly/test/GPGPU/scalar-parameter.ll index 871ced11e1a..38b38f4054e 100644 --- a/polly/test/GPGPU/scalar-parameter.ll +++ b/polly/test/GPGPU/scalar-parameter.ll @@ -158,156 +158,6 @@ bb7: ; preds = %bb1 ; CODE-NEXT: ==== ; CODE-NEXT: # host ; CODE-NEXT: { -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(fp128), cudaMemcpyHostToDevice)); -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_b, &MemRef_b, sizeof(fp128), cudaMemcpyHostToDevice)); -; CODE-NEXT: { -; CODE-NEXT: dim3 k0_dimBlock(32); -; CODE-NEXT: dim3 k0_dimGrid(32); -; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A, dev_MemRef_b); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(fp128), cudaMemcpyDeviceToHost)); -; CODE-NEXT: } - -; CODE: # kernel0 -; CODE-NEXT: Stmt_bb2(32 * b0 + t0); - -; void foo(fp128 A[], fp128 b) { -; for (long i = 0; i < 1024; i++) -; A[i] += b; -; } -; -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @fp128(fp128* %A, fp128 %b) { -bb: - br label %bb1 - -bb1: ; preds = %bb5, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ] - %exitcond = icmp ne i64 %i.0, 1024 - br i1 %exitcond, label %bb2, label %bb7 - -bb2: ; preds = %bb1 - %tmp = getelementptr inbounds fp128, fp128* %A, i64 %i.0 - %tmp3 = load fp128, fp128* %tmp, align 4 - %tmp4 = fadd fp128 %tmp3, %b - store fp128 %tmp4, fp128* %tmp, align 4 - br label %bb5 - -bb5: ; preds = %bb2 - %tmp6 = add nuw nsw i64 %i.0, 1 - br label %bb1 - -bb7: ; preds = %bb1 - ret void -} - -; CODE: Code -; CODE-NEXT: ==== -; CODE-NEXT: # host -; CODE-NEXT: { -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(x86_fp80), cudaMemcpyHostToDevice)); -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_b, &MemRef_b, sizeof(x86_fp80), cudaMemcpyHostToDevice)); -; CODE-NEXT: { -; CODE-NEXT: dim3 k0_dimBlock(32); -; CODE-NEXT: dim3 k0_dimGrid(32); -; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A, dev_MemRef_b); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(x86_fp80), cudaMemcpyDeviceToHost)); -; CODE-NEXT: } - -; CODE: # kernel0 -; CODE-NEXT: Stmt_bb2(32 * b0 + t0); - -; void foo(x86_fp80 A[], x86_fp80 b) { -; for (long i = 0; i < 1024; i++) -; A[i] += b; -; } -; -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @x86_fp80(x86_fp80* %A, x86_fp80 %b) { -bb: - br label %bb1 - -bb1: ; preds = %bb5, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ] - %exitcond = icmp ne i64 %i.0, 1024 - br i1 %exitcond, label %bb2, label %bb7 - -bb2: ; preds = %bb1 - %tmp = getelementptr inbounds x86_fp80, x86_fp80* %A, i64 %i.0 - %tmp3 = load x86_fp80, x86_fp80* %tmp, align 4 - %tmp4 = fadd x86_fp80 %tmp3, %b - store x86_fp80 %tmp4, x86_fp80* %tmp, align 4 - br label %bb5 - -bb5: ; preds = %bb2 - %tmp6 = add nuw nsw i64 %i.0, 1 - br label %bb1 - -bb7: ; preds = %bb1 - ret void -} - -; CODE: Code -; CODE-NEXT: ==== -; CODE-NEXT: # host -; CODE-NEXT: { -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(ppc_fp128), cudaMemcpyHostToDevice)); -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_b, &MemRef_b, sizeof(ppc_fp128), cudaMemcpyHostToDevice)); -; CODE-NEXT: { -; CODE-NEXT: dim3 k0_dimBlock(32); -; CODE-NEXT: dim3 k0_dimGrid(32); -; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A, dev_MemRef_b); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(ppc_fp128), cudaMemcpyDeviceToHost)); -; CODE-NEXT: } - -; CODE: # kernel0 -; CODE-NEXT: Stmt_bb2(32 * b0 + t0); - -; void foo(ppc_fp128 A[], ppc_fp128 b) { -; for (long i = 0; i < 1024; i++) -; A[i] += b; -; } -; -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @ppc_fp128(ppc_fp128* %A, ppc_fp128 %b) { -bb: - br label %bb1 - -bb1: ; preds = %bb5, %bb - %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ] - %exitcond = icmp ne i64 %i.0, 1024 - br i1 %exitcond, label %bb2, label %bb7 - -bb2: ; preds = %bb1 - %tmp = getelementptr inbounds ppc_fp128, ppc_fp128* %A, i64 %i.0 - %tmp3 = load ppc_fp128, ppc_fp128* %tmp, align 4 - %tmp4 = fadd ppc_fp128 %tmp3, %b - store ppc_fp128 %tmp4, ppc_fp128* %tmp, align 4 - br label %bb5 - -bb5: ; preds = %bb2 - %tmp6 = add nuw nsw i64 %i.0, 1 - br label %bb1 - -bb7: ; preds = %bb1 - ret void -} - -; CODE: Code -; CODE-NEXT: ==== -; CODE-NEXT: # host -; CODE-NEXT: { ; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i1), cudaMemcpyHostToDevice)); ; CODE-NEXT: { ; CODE-NEXT: dim3 k0_dimBlock(32); @@ -597,199 +447,3 @@ bb5: ; preds = %bb2 bb7: ; preds = %bb1 ret void } - -; CODE: Code -; CODE-NEXT: ==== -; CODE-NEXT: # host -; CODE-NEXT: { -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i80), cudaMemcpyHostToDevice)); -; CODE-NEXT: { -; CODE-NEXT: dim3 k0_dimBlock(32); -; CODE-NEXT: dim3 k0_dimGrid(32); -; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i80), cudaMemcpyDeviceToHost)); -; CODE-NEXT: } - -; CODE: # kernel0 -; CODE-NEXT: Stmt_bb2(32 * b0 + t0); - -; void foo(i80 A[], i80 b) { -; for (long i = 0; i < 1024; i++) -; A[i] += b; -; } -; -target datalayout = "e-m:e-i80:64-f80:128-n8:16:32:64-S128" - -define void @i80(i80* %A, i80 %b) { -bb: - br label %bb1 - -bb1: ; preds = %bb5, %bb - %i.0 = phi i80 [ 0, %bb ], [ %tmp6, %bb5 ] - %exitcond = icmp ne i80 %i.0, 1024 - br i1 %exitcond, label %bb2, label %bb7 - -bb2: ; preds = %bb1 - %tmp = getelementptr inbounds i80, i80* %A, i80 %i.0 - %tmp3 = load i80, i80* %tmp, align 4 - %tmp4 = add i80 %tmp3, %b - store i80 %tmp4, i80* %tmp, align 4 - br label %bb5 - -bb5: ; preds = %bb2 - %tmp6 = add nuw nsw i80 %i.0, 1 - br label %bb1 - -bb7: ; preds = %bb1 - ret void -} - -; CODE: Code -; CODE-NEXT: ==== -; CODE-NEXT: # host -; CODE-NEXT: { -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i120), cudaMemcpyHostToDevice)); -; CODE-NEXT: { -; CODE-NEXT: dim3 k0_dimBlock(32); -; CODE-NEXT: dim3 k0_dimGrid(32); -; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i120), cudaMemcpyDeviceToHost)); -; CODE-NEXT: } - -; CODE: # kernel0 -; CODE-NEXT: Stmt_bb2(32 * b0 + t0); - -; void foo(i120 A[], i120 b) { -; for (long i = 0; i < 1024; i++) -; A[i] += b; -; } -; -target datalayout = "e-m:e-i120:64-f80:128-n8:16:32:64-S128" - -define void @i120(i120* %A, i120 %b) { -bb: - br label %bb1 - -bb1: ; preds = %bb5, %bb - %i.0 = phi i120 [ 0, %bb ], [ %tmp6, %bb5 ] - %exitcond = icmp ne i120 %i.0, 1024 - br i1 %exitcond, label %bb2, label %bb7 - -bb2: ; preds = %bb1 - %tmp = getelementptr inbounds i120, i120* %A, i120 %i.0 - %tmp3 = load i120, i120* %tmp, align 4 - %tmp4 = add i120 %tmp3, %b - store i120 %tmp4, i120* %tmp, align 4 - br label %bb5 - -bb5: ; preds = %bb2 - %tmp6 = add nuw nsw i120 %i.0, 1 - br label %bb1 - -bb7: ; preds = %bb1 - ret void -} - -; CODE: Code -; CODE-NEXT: ==== -; CODE-NEXT: # host -; CODE-NEXT: { -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i128), cudaMemcpyHostToDevice)); -; CODE-NEXT: { -; CODE-NEXT: dim3 k0_dimBlock(32); -; CODE-NEXT: dim3 k0_dimGrid(32); -; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i128), cudaMemcpyDeviceToHost)); -; CODE-NEXT: } - -; CODE: # kernel0 -; CODE-NEXT: Stmt_bb2(32 * b0 + t0); - -; void foo(i128 A[], i128 b) { -; for (long i = 0; i < 1024; i++) -; A[i] += b; -; } -; -target datalayout = "e-m:e-i128:64-f80:128-n8:16:32:64-S128" - -define void @i128(i128* %A, i128 %b) { -bb: - br label %bb1 - -bb1: ; preds = %bb5, %bb - %i.0 = phi i128 [ 0, %bb ], [ %tmp6, %bb5 ] - %exitcond = icmp ne i128 %i.0, 1024 - br i1 %exitcond, label %bb2, label %bb7 - -bb2: ; preds = %bb1 - %tmp = getelementptr inbounds i128, i128* %A, i128 %i.0 - %tmp3 = load i128, i128* %tmp, align 4 - %tmp4 = add i128 %tmp3, %b - store i128 %tmp4, i128* %tmp, align 4 - br label %bb5 - -bb5: ; preds = %bb2 - %tmp6 = add nuw nsw i128 %i.0, 1 - br label %bb1 - -bb7: ; preds = %bb1 - ret void -} - -; CODE: Code -; CODE-NEXT: ==== -; CODE-NEXT: # host -; CODE-NEXT: { -; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i3000), cudaMemcpyHostToDevice)); -; CODE-NEXT: { -; CODE-NEXT: dim3 k0_dimBlock(32); -; CODE-NEXT: dim3 k0_dimGrid(32); -; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A); -; CODE-NEXT: cudaCheckKernel(); -; CODE-NEXT: } - -; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i3000), cudaMemcpyDeviceToHost)); -; CODE-NEXT: } - -; CODE: # kernel0 -; CODE-NEXT: Stmt_bb2(32 * b0 + t0); - -; void foo(i3000 A[], i3000 b) { -; for (long i = 0; i < 1024; i++) -; A[i] += b; -; } -; -target datalayout = "e-m:e-i3000:64-f80:128-n8:16:32:64-S128" - -define void @i3000(i3000* %A, i3000 %b) { -bb: - br label %bb1 - -bb1: ; preds = %bb5, %bb - %i.0 = phi i3000 [ 0, %bb ], [ %tmp6, %bb5 ] - %exitcond = icmp ne i3000 %i.0, 1024 - br i1 %exitcond, label %bb2, label %bb7 - -bb2: ; preds = %bb1 - %tmp = getelementptr inbounds i3000, i3000* %A, i3000 %i.0 - %tmp3 = load i3000, i3000* %tmp, align 4 - %tmp4 = add i3000 %tmp3, %b - store i3000 %tmp4, i3000* %tmp, align 4 - br label %bb5 - -bb5: ; preds = %bb2 - %tmp6 = add nuw nsw i3000 %i.0, 1 - br label %bb1 - -bb7: ; preds = %bb1 - ret void -} |