summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--polly/lib/CodeGen/PPCGCodeGeneration.cpp123
-rw-r--r--polly/test/GPGPU/double-parallel-loop.ll14
-rw-r--r--polly/test/GPGPU/scalar-parameter-fp128.ll39
-rw-r--r--polly/test/GPGPU/scalar-parameter-i120.ll39
-rw-r--r--polly/test/GPGPU/scalar-parameter-i128.ll39
-rw-r--r--polly/test/GPGPU/scalar-parameter-i3000.ll39
-rw-r--r--polly/test/GPGPU/scalar-parameter-i80.ll40
-rw-r--r--polly/test/GPGPU/scalar-parameter-ppc_fp128.ll38
-rw-r--r--polly/test/GPGPU/scalar-parameter-x86_fp80.ll39
-rw-r--r--polly/test/GPGPU/scalar-parameter.ll346
10 files changed, 410 insertions, 346 deletions
diff --git a/polly/lib/CodeGen/PPCGCodeGeneration.cpp b/polly/lib/CodeGen/PPCGCodeGeneration.cpp
index 8b4d2220297..74dd31d91fc 100644
--- a/polly/lib/CodeGen/PPCGCodeGeneration.cpp
+++ b/polly/lib/CodeGen/PPCGCodeGeneration.cpp
@@ -19,11 +19,18 @@
#include "polly/Options.h"
#include "polly/ScopInfo.h"
#include "polly/Support/SCEVValidator.h"
+#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Target/TargetMachine.h"
#include "isl/union_map.h"
@@ -57,6 +64,21 @@ static cl::opt<bool> DumpKernelIR("polly-acc-dump-kernel-ir",
cl::Hidden, cl::init(false), cl::ZeroOrMore,
cl::cat(PollyCategory));
+static cl::opt<bool> DumpKernelASM("polly-acc-dump-kernel-asm",
+ cl::desc("Dump the kernel assembly code"),
+ cl::Hidden, cl::init(false), cl::ZeroOrMore,
+ cl::cat(PollyCategory));
+
+static cl::opt<bool> FastMath("polly-acc-fastmath",
+ cl::desc("Allow unsafe math optimizations"),
+ cl::Hidden, cl::init(false), cl::ZeroOrMore,
+ cl::cat(PollyCategory));
+
+static cl::opt<std::string>
+ CudaVersion("polly-acc-cuda-version",
+ cl::desc("The CUDA version to compile for"), cl::Hidden,
+ cl::init("sm_30"), cl::ZeroOrMore, cl::cat(PollyCategory));
+
/// Create the ast expressions for a ScopStmt.
///
/// This function is a callback for to generate the ast expressions for each
@@ -112,6 +134,12 @@ public:
}
private:
+ /// A vector of array base pointers for which a new ScopArrayInfo was created.
+ ///
+ /// This vector is used to delete the ScopArrayInfo when it is not needed any
+ /// more.
+ std::vector<Value *> LocalArrays;
+
/// A module containing GPU code.
///
/// This pointer is only set in case we are currently generating GPU code.
@@ -201,6 +229,26 @@ private:
/// Create an in-kernel synchronization call.
void createKernelSync();
+ /// Create a PTX assembly string for the current GPU kernel.
+ ///
+ /// @returns A string containing the corresponding PTX assembly code.
+ std::string createKernelASM();
+
+ /// Remove references from the dominator tree to the kernel function @p F.
+ ///
+ /// @param F The function to remove references to.
+ void clearDominators(Function *F);
+
+ /// Remove references from scalar evolution to the kernel function @p F.
+ ///
+ /// @param F The function to remove references to.
+ void clearScalarEvolution(Function *F);
+
+ /// Remove references from loop info to the kernel function @p F.
+ ///
+ /// @param F The function to remove references to.
+ void clearLoops(Function *F);
+
/// Finalize the generation of the kernel function.
///
/// Free the LLVM-IR module corresponding to the kernel and -- if requested --
@@ -360,6 +408,33 @@ SetVector<Value *> GPUNodeBuilder::getReferencesInKernel(ppcg_kernel *Kernel) {
return SubtreeValues;
}
+void GPUNodeBuilder::clearDominators(Function *F) {
+ DomTreeNode *N = DT.getNode(&F->getEntryBlock());
+ std::vector<BasicBlock *> Nodes;
+ for (po_iterator<DomTreeNode *> I = po_begin(N), E = po_end(N); I != E; ++I)
+ Nodes.push_back(I->getBlock());
+
+ for (BasicBlock *BB : Nodes)
+ DT.eraseNode(BB);
+}
+
+void GPUNodeBuilder::clearScalarEvolution(Function *F) {
+ for (BasicBlock &BB : *F) {
+ Loop *L = LI.getLoopFor(&BB);
+ if (L)
+ SE.forgetLoop(L);
+ }
+}
+
+void GPUNodeBuilder::clearLoops(Function *F) {
+ for (BasicBlock &BB : *F) {
+ Loop *L = LI.getLoopFor(&BB);
+ if (L)
+ SE.forgetLoop(L);
+ LI.removeBlock(&BB);
+ }
+}
+
void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
isl_id *Id = isl_ast_node_get_annotation(KernelStmt);
ppcg_kernel *Kernel = (ppcg_kernel *)isl_id_get_user(Id);
@@ -392,6 +467,11 @@ void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
create(isl_ast_node_copy(Kernel->tree));
+ Function *F = Builder.GetInsertBlock()->getParent();
+ clearDominators(F);
+ clearScalarEvolution(F);
+ clearLoops(F);
+
Builder.SetInsertPoint(&HostInsertPoint);
IDToValue = HostIDs;
@@ -400,6 +480,10 @@ void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
PHIOpMap.clear();
EscapeMap.clear();
IDToSAI.clear();
+ Annotator.resetAlternativeAliasBases();
+ for (auto &BasePtr : LocalArrays)
+ S.invalidateScopArrayInfo(BasePtr, ScopArrayInfo::MK_Array);
+ LocalArrays.clear();
finalizeKernelFunction();
}
@@ -471,6 +555,7 @@ GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel,
}
const ScopArrayInfo *SAIRep =
S.getOrCreateScopArrayInfo(Val, EleTy, Sizes, ScopArrayInfo::MK_Array);
+ LocalArrays.push_back(Val);
isl_ast_build_free(Build);
isl_id_free(Id);
@@ -555,11 +640,49 @@ void GPUNodeBuilder::createKernelFunction(ppcg_kernel *Kernel,
insertKernelIntrinsics(Kernel);
}
+std::string GPUNodeBuilder::createKernelASM() {
+ llvm::Triple GPUTriple(Triple::normalize("nvptx64-nvidia-cuda"));
+ std::string ErrMsg;
+ auto GPUTarget = TargetRegistry::lookupTarget(GPUTriple.getTriple(), ErrMsg);
+
+ if (!GPUTarget) {
+ errs() << ErrMsg << "\n";
+ return "";
+ }
+
+ TargetOptions Options;
+ Options.UnsafeFPMath = FastMath;
+ std::unique_ptr<TargetMachine> TargetM(
+ GPUTarget->createTargetMachine(GPUTriple.getTriple(), CudaVersion, "",
+ Options, Optional<Reloc::Model>()));
+
+ SmallString<0> ASMString;
+ raw_svector_ostream ASMStream(ASMString);
+ llvm::legacy::PassManager PM;
+
+ PM.add(createTargetTransformInfoWrapperPass(TargetM->getTargetIRAnalysis()));
+
+ if (TargetM->addPassesToEmitFile(
+ PM, ASMStream, TargetMachine::CGFT_AssemblyFile, true /* verify */)) {
+ errs() << "The target does not support generation of this file type!\n";
+ return "";
+ }
+
+ PM.run(*GPUModule);
+
+ return ASMStream.str();
+}
+
void GPUNodeBuilder::finalizeKernelFunction() {
if (DumpKernelIR)
outs() << *GPUModule << "\n";
+ std::string Assembly = createKernelASM();
+
+ if (DumpKernelASM)
+ outs() << Assembly << "\n";
+
GPUModule.release();
KernelIDs.clear();
}
diff --git a/polly/test/GPGPU/double-parallel-loop.ll b/polly/test/GPGPU/double-parallel-loop.ll
index 2eee8aaaa90..0cea456e5a4 100644
--- a/polly/test/GPGPU/double-parallel-loop.ll
+++ b/polly/test/GPGPU/double-parallel-loop.ll
@@ -14,6 +14,10 @@
; RUN: -disable-output < %s | \
; RUN: FileCheck %s -check-prefix=KERNEL-IR
+; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-asm \
+; RUN: -disable-output < %s | \
+; RUN: FileCheck %s -check-prefix=KERNEL-ASM
+
; REQUIRES: pollyacc
; CHECK: Stmt_bb5
@@ -152,6 +156,16 @@
; KERNEL-IR-NEXT: br label %polly.loop_header
+; KERNEL-ASM: .version 3.2
+; KERNEL-ASM-NEXT: .target sm_30
+; KERNEL-ASM-NEXT: .address_size 64
+
+; KERNEL-ASM: // .globl kernel_0
+
+; KERNEL-ASM: .visible .entry kernel_0(
+; KERNEL-ASM-NEXT: .param .u64 kernel_0_param_0
+; KERNEL-ASM-NEXT: )
+
; void double_parallel_loop(float A[][1024]) {
; for (long i = 0; i < 1024; i++)
; for (long j = 0; j < 1024; j++)
diff --git a/polly/test/GPGPU/scalar-parameter-fp128.ll b/polly/test/GPGPU/scalar-parameter-fp128.ll
new file mode 100644
index 00000000000..568ac9c86cb
--- /dev/null
+++ b/polly/test/GPGPU/scalar-parameter-fp128.ll
@@ -0,0 +1,39 @@
+; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code %s
+
+; XFAIL: *
+
+; REQUIRES: pollyacc
+
+; This fails today with "type mismatch between callee prototype and arguments"
+
+; void foo(fp128 A[], fp128 b) {
+; for (long i = 0; i < 1024; i++)
+; A[i] += b;
+; }
+;
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @fp128(fp128* %A, fp128 %b) {
+bb:
+ br label %bb1
+
+bb1: ; preds = %bb5, %bb
+ %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ]
+ %exitcond = icmp ne i64 %i.0, 1024
+ br i1 %exitcond, label %bb2, label %bb7
+
+bb2: ; preds = %bb1
+ %tmp = getelementptr inbounds fp128, fp128* %A, i64 %i.0
+ %tmp3 = load fp128, fp128* %tmp, align 4
+ %tmp4 = fadd fp128 %tmp3, %b
+ store fp128 %tmp4, fp128* %tmp, align 4
+ br label %bb5
+
+bb5: ; preds = %bb2
+ %tmp6 = add nuw nsw i64 %i.0, 1
+ br label %bb1
+
+bb7: ; preds = %bb1
+ ret void
+}
+
diff --git a/polly/test/GPGPU/scalar-parameter-i120.ll b/polly/test/GPGPU/scalar-parameter-i120.ll
new file mode 100644
index 00000000000..e87b0cbb881
--- /dev/null
+++ b/polly/test/GPGPU/scalar-parameter-i120.ll
@@ -0,0 +1,39 @@
+; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code %s
+
+; XFAIL: *
+
+; REQUIRES: pollyacc
+
+; This fails today with "type mismatch between callee prototype and arguments"
+
+; void foo(i120 A[], i120 b) {
+; for (long i = 0; i < 1024; i++)
+; A[i] += b;
+; }
+;
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @i120(i120* %A, i120 %b) {
+bb:
+ br label %bb1
+
+bb1: ; preds = %bb5, %bb
+ %i.0 = phi i120 [ 0, %bb ], [ %tmp6, %bb5 ]
+ %exitcond = icmp ne i120 %i.0, 1024
+ br i1 %exitcond, label %bb2, label %bb7
+
+bb2: ; preds = %bb1
+ %tmp = getelementptr inbounds i120, i120* %A, i120 %i.0
+ %tmp3 = load i120, i120* %tmp, align 4
+ %tmp4 = add i120 %tmp3, %b
+ store i120 %tmp4, i120* %tmp, align 4
+ br label %bb5
+
+bb5: ; preds = %bb2
+ %tmp6 = add nuw nsw i120 %i.0, 1
+ br label %bb1
+
+bb7: ; preds = %bb1
+ ret void
+}
+
diff --git a/polly/test/GPGPU/scalar-parameter-i128.ll b/polly/test/GPGPU/scalar-parameter-i128.ll
new file mode 100644
index 00000000000..c852a0cb11a
--- /dev/null
+++ b/polly/test/GPGPU/scalar-parameter-i128.ll
@@ -0,0 +1,39 @@
+; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code %s
+
+; XFAIL: *
+
+; REQUIRES: pollyacc
+
+; This fails today with "LowerFormalArguments didn't emit the correct number of
+; values!"
+
+; void foo(i128 A[], i128 b) {
+; for (long i = 0; i < 1024; i++)
+; A[i] += b;
+; }
+;
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @i128(i128* %A, i128 %b) {
+bb:
+ br label %bb1
+
+bb1: ; preds = %bb5, %bb
+ %i.0 = phi i128 [ 0, %bb ], [ %tmp6, %bb5 ]
+ %exitcond = icmp ne i128 %i.0, 1024
+ br i1 %exitcond, label %bb2, label %bb7
+
+bb2: ; preds = %bb1
+ %tmp = getelementptr inbounds i128, i128* %A, i128 %i.0
+ %tmp3 = load i128, i128* %tmp, align 4
+ %tmp4 = add i128 %tmp3, %b
+ store i128 %tmp4, i128* %tmp, align 4
+ br label %bb5
+
+bb5: ; preds = %bb2
+ %tmp6 = add nuw nsw i128 %i.0, 1
+ br label %bb1
+
+bb7: ; preds = %bb1
+ ret void
+}
diff --git a/polly/test/GPGPU/scalar-parameter-i3000.ll b/polly/test/GPGPU/scalar-parameter-i3000.ll
new file mode 100644
index 00000000000..ad7d17e4990
--- /dev/null
+++ b/polly/test/GPGPU/scalar-parameter-i3000.ll
@@ -0,0 +1,39 @@
+; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code %s
+
+; XFAIL: *
+
+; REQUIRES: pollyacc
+
+; This fails today with "LowerFormalArguments didn't emit the correct number of
+; values!"
+
+; void foo(i3000 A[], i3000 b) {
+; for (long i = 0; i < 1024; i++)
+; A[i] += b;
+; }
+;
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @i3000(i3000* %A, i3000 %b) {
+bb:
+ br label %bb1
+
+bb1: ; preds = %bb5, %bb
+ %i.0 = phi i3000 [ 0, %bb ], [ %tmp6, %bb5 ]
+ %exitcond = icmp ne i3000 %i.0, 1024
+ br i1 %exitcond, label %bb2, label %bb7
+
+bb2: ; preds = %bb1
+ %tmp = getelementptr inbounds i3000, i3000* %A, i3000 %i.0
+ %tmp3 = load i3000, i3000* %tmp, align 4
+ %tmp4 = add i3000 %tmp3, %b
+ store i3000 %tmp4, i3000* %tmp, align 4
+ br label %bb5
+
+bb5: ; preds = %bb2
+ %tmp6 = add nuw nsw i3000 %i.0, 1
+ br label %bb1
+
+bb7: ; preds = %bb1
+ ret void
+}
diff --git a/polly/test/GPGPU/scalar-parameter-i80.ll b/polly/test/GPGPU/scalar-parameter-i80.ll
new file mode 100644
index 00000000000..5662d736462
--- /dev/null
+++ b/polly/test/GPGPU/scalar-parameter-i80.ll
@@ -0,0 +1,40 @@
+; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code %s
+
+; XFAIL: *
+
+; REQUIRES: pollyacc
+
+; This fails today with "LowerFormalArguments didn't emit the correct number of
+; values!"
+
+; void foo(i80 A[], i80 b) {
+; for (long i = 0; i < 1024; i++)
+; A[i] += b;
+; }
+;
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @i80(i80* %A, i80 %b) {
+bb:
+ br label %bb1
+
+bb1: ; preds = %bb5, %bb
+ %i.0 = phi i80 [ 0, %bb ], [ %tmp6, %bb5 ]
+ %exitcond = icmp ne i80 %i.0, 1024
+ br i1 %exitcond, label %bb2, label %bb7
+
+bb2: ; preds = %bb1
+ %tmp = getelementptr inbounds i80, i80* %A, i80 %i.0
+ %tmp3 = load i80, i80* %tmp, align 4
+ %tmp4 = add i80 %tmp3, %b
+ store i80 %tmp4, i80* %tmp, align 4
+ br label %bb5
+
+bb5: ; preds = %bb2
+ %tmp6 = add nuw nsw i80 %i.0, 1
+ br label %bb1
+
+bb7: ; preds = %bb1
+ ret void
+}
+
diff --git a/polly/test/GPGPU/scalar-parameter-ppc_fp128.ll b/polly/test/GPGPU/scalar-parameter-ppc_fp128.ll
new file mode 100644
index 00000000000..fa78c5faccc
--- /dev/null
+++ b/polly/test/GPGPU/scalar-parameter-ppc_fp128.ll
@@ -0,0 +1,38 @@
+; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code %s
+
+; XFAIL: *
+
+; REQUIRES: pollyacc
+
+; This fails today with "type mismatch between callee prototype and arguments"
+
+; void foo(fp128 A[], fp128 b) {
+; for (long i = 0; i < 1024; i++)
+; A[i] += b;
+; }
+;
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @ppc_fp128(ppc_fp128* %A, ppc_fp128 %b) {
+bb:
+ br label %bb1
+
+bb1: ; preds = %bb5, %bb
+ %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ]
+ %exitcond = icmp ne i64 %i.0, 1024
+ br i1 %exitcond, label %bb2, label %bb7
+
+bb2: ; preds = %bb1
+ %tmp = getelementptr inbounds ppc_fp128, ppc_fp128* %A, i64 %i.0
+ %tmp3 = load ppc_fp128, ppc_fp128* %tmp, align 4
+ %tmp4 = fadd ppc_fp128 %tmp3, %b
+ store ppc_fp128 %tmp4, ppc_fp128* %tmp, align 4
+ br label %bb5
+
+bb5: ; preds = %bb2
+ %tmp6 = add nuw nsw i64 %i.0, 1
+ br label %bb1
+
+bb7: ; preds = %bb1
+ ret void
+}
diff --git a/polly/test/GPGPU/scalar-parameter-x86_fp80.ll b/polly/test/GPGPU/scalar-parameter-x86_fp80.ll
new file mode 100644
index 00000000000..568ac9c86cb
--- /dev/null
+++ b/polly/test/GPGPU/scalar-parameter-x86_fp80.ll
@@ -0,0 +1,39 @@
+; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code %s
+
+; XFAIL: *
+
+; REQUIRES: pollyacc
+
+; This fails today with "type mismatch between callee prototype and arguments"
+
+; void foo(fp128 A[], fp128 b) {
+; for (long i = 0; i < 1024; i++)
+; A[i] += b;
+; }
+;
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @fp128(fp128* %A, fp128 %b) {
+bb:
+ br label %bb1
+
+bb1: ; preds = %bb5, %bb
+ %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ]
+ %exitcond = icmp ne i64 %i.0, 1024
+ br i1 %exitcond, label %bb2, label %bb7
+
+bb2: ; preds = %bb1
+ %tmp = getelementptr inbounds fp128, fp128* %A, i64 %i.0
+ %tmp3 = load fp128, fp128* %tmp, align 4
+ %tmp4 = fadd fp128 %tmp3, %b
+ store fp128 %tmp4, fp128* %tmp, align 4
+ br label %bb5
+
+bb5: ; preds = %bb2
+ %tmp6 = add nuw nsw i64 %i.0, 1
+ br label %bb1
+
+bb7: ; preds = %bb1
+ ret void
+}
+
diff --git a/polly/test/GPGPU/scalar-parameter.ll b/polly/test/GPGPU/scalar-parameter.ll
index 871ced11e1a..38b38f4054e 100644
--- a/polly/test/GPGPU/scalar-parameter.ll
+++ b/polly/test/GPGPU/scalar-parameter.ll
@@ -158,156 +158,6 @@ bb7: ; preds = %bb1
; CODE-NEXT: ====
; CODE-NEXT: # host
; CODE-NEXT: {
-; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(fp128), cudaMemcpyHostToDevice));
-; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_b, &MemRef_b, sizeof(fp128), cudaMemcpyHostToDevice));
-; CODE-NEXT: {
-; CODE-NEXT: dim3 k0_dimBlock(32);
-; CODE-NEXT: dim3 k0_dimGrid(32);
-; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A, dev_MemRef_b);
-; CODE-NEXT: cudaCheckKernel();
-; CODE-NEXT: }
-
-; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(fp128), cudaMemcpyDeviceToHost));
-; CODE-NEXT: }
-
-; CODE: # kernel0
-; CODE-NEXT: Stmt_bb2(32 * b0 + t0);
-
-; void foo(fp128 A[], fp128 b) {
-; for (long i = 0; i < 1024; i++)
-; A[i] += b;
-; }
-;
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-define void @fp128(fp128* %A, fp128 %b) {
-bb:
- br label %bb1
-
-bb1: ; preds = %bb5, %bb
- %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ]
- %exitcond = icmp ne i64 %i.0, 1024
- br i1 %exitcond, label %bb2, label %bb7
-
-bb2: ; preds = %bb1
- %tmp = getelementptr inbounds fp128, fp128* %A, i64 %i.0
- %tmp3 = load fp128, fp128* %tmp, align 4
- %tmp4 = fadd fp128 %tmp3, %b
- store fp128 %tmp4, fp128* %tmp, align 4
- br label %bb5
-
-bb5: ; preds = %bb2
- %tmp6 = add nuw nsw i64 %i.0, 1
- br label %bb1
-
-bb7: ; preds = %bb1
- ret void
-}
-
-; CODE: Code
-; CODE-NEXT: ====
-; CODE-NEXT: # host
-; CODE-NEXT: {
-; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(x86_fp80), cudaMemcpyHostToDevice));
-; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_b, &MemRef_b, sizeof(x86_fp80), cudaMemcpyHostToDevice));
-; CODE-NEXT: {
-; CODE-NEXT: dim3 k0_dimBlock(32);
-; CODE-NEXT: dim3 k0_dimGrid(32);
-; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A, dev_MemRef_b);
-; CODE-NEXT: cudaCheckKernel();
-; CODE-NEXT: }
-
-; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(x86_fp80), cudaMemcpyDeviceToHost));
-; CODE-NEXT: }
-
-; CODE: # kernel0
-; CODE-NEXT: Stmt_bb2(32 * b0 + t0);
-
-; void foo(x86_fp80 A[], x86_fp80 b) {
-; for (long i = 0; i < 1024; i++)
-; A[i] += b;
-; }
-;
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-define void @x86_fp80(x86_fp80* %A, x86_fp80 %b) {
-bb:
- br label %bb1
-
-bb1: ; preds = %bb5, %bb
- %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ]
- %exitcond = icmp ne i64 %i.0, 1024
- br i1 %exitcond, label %bb2, label %bb7
-
-bb2: ; preds = %bb1
- %tmp = getelementptr inbounds x86_fp80, x86_fp80* %A, i64 %i.0
- %tmp3 = load x86_fp80, x86_fp80* %tmp, align 4
- %tmp4 = fadd x86_fp80 %tmp3, %b
- store x86_fp80 %tmp4, x86_fp80* %tmp, align 4
- br label %bb5
-
-bb5: ; preds = %bb2
- %tmp6 = add nuw nsw i64 %i.0, 1
- br label %bb1
-
-bb7: ; preds = %bb1
- ret void
-}
-
-; CODE: Code
-; CODE-NEXT: ====
-; CODE-NEXT: # host
-; CODE-NEXT: {
-; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(ppc_fp128), cudaMemcpyHostToDevice));
-; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_b, &MemRef_b, sizeof(ppc_fp128), cudaMemcpyHostToDevice));
-; CODE-NEXT: {
-; CODE-NEXT: dim3 k0_dimBlock(32);
-; CODE-NEXT: dim3 k0_dimGrid(32);
-; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A, dev_MemRef_b);
-; CODE-NEXT: cudaCheckKernel();
-; CODE-NEXT: }
-
-; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(ppc_fp128), cudaMemcpyDeviceToHost));
-; CODE-NEXT: }
-
-; CODE: # kernel0
-; CODE-NEXT: Stmt_bb2(32 * b0 + t0);
-
-; void foo(ppc_fp128 A[], ppc_fp128 b) {
-; for (long i = 0; i < 1024; i++)
-; A[i] += b;
-; }
-;
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-define void @ppc_fp128(ppc_fp128* %A, ppc_fp128 %b) {
-bb:
- br label %bb1
-
-bb1: ; preds = %bb5, %bb
- %i.0 = phi i64 [ 0, %bb ], [ %tmp6, %bb5 ]
- %exitcond = icmp ne i64 %i.0, 1024
- br i1 %exitcond, label %bb2, label %bb7
-
-bb2: ; preds = %bb1
- %tmp = getelementptr inbounds ppc_fp128, ppc_fp128* %A, i64 %i.0
- %tmp3 = load ppc_fp128, ppc_fp128* %tmp, align 4
- %tmp4 = fadd ppc_fp128 %tmp3, %b
- store ppc_fp128 %tmp4, ppc_fp128* %tmp, align 4
- br label %bb5
-
-bb5: ; preds = %bb2
- %tmp6 = add nuw nsw i64 %i.0, 1
- br label %bb1
-
-bb7: ; preds = %bb1
- ret void
-}
-
-; CODE: Code
-; CODE-NEXT: ====
-; CODE-NEXT: # host
-; CODE-NEXT: {
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i1), cudaMemcpyHostToDevice));
; CODE-NEXT: {
; CODE-NEXT: dim3 k0_dimBlock(32);
@@ -597,199 +447,3 @@ bb5: ; preds = %bb2
bb7: ; preds = %bb1
ret void
}
-
-; CODE: Code
-; CODE-NEXT: ====
-; CODE-NEXT: # host
-; CODE-NEXT: {
-; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i80), cudaMemcpyHostToDevice));
-; CODE-NEXT: {
-; CODE-NEXT: dim3 k0_dimBlock(32);
-; CODE-NEXT: dim3 k0_dimGrid(32);
-; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A);
-; CODE-NEXT: cudaCheckKernel();
-; CODE-NEXT: }
-
-; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i80), cudaMemcpyDeviceToHost));
-; CODE-NEXT: }
-
-; CODE: # kernel0
-; CODE-NEXT: Stmt_bb2(32 * b0 + t0);
-
-; void foo(i80 A[], i80 b) {
-; for (long i = 0; i < 1024; i++)
-; A[i] += b;
-; }
-;
-target datalayout = "e-m:e-i80:64-f80:128-n8:16:32:64-S128"
-
-define void @i80(i80* %A, i80 %b) {
-bb:
- br label %bb1
-
-bb1: ; preds = %bb5, %bb
- %i.0 = phi i80 [ 0, %bb ], [ %tmp6, %bb5 ]
- %exitcond = icmp ne i80 %i.0, 1024
- br i1 %exitcond, label %bb2, label %bb7
-
-bb2: ; preds = %bb1
- %tmp = getelementptr inbounds i80, i80* %A, i80 %i.0
- %tmp3 = load i80, i80* %tmp, align 4
- %tmp4 = add i80 %tmp3, %b
- store i80 %tmp4, i80* %tmp, align 4
- br label %bb5
-
-bb5: ; preds = %bb2
- %tmp6 = add nuw nsw i80 %i.0, 1
- br label %bb1
-
-bb7: ; preds = %bb1
- ret void
-}
-
-; CODE: Code
-; CODE-NEXT: ====
-; CODE-NEXT: # host
-; CODE-NEXT: {
-; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i120), cudaMemcpyHostToDevice));
-; CODE-NEXT: {
-; CODE-NEXT: dim3 k0_dimBlock(32);
-; CODE-NEXT: dim3 k0_dimGrid(32);
-; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A);
-; CODE-NEXT: cudaCheckKernel();
-; CODE-NEXT: }
-
-; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i120), cudaMemcpyDeviceToHost));
-; CODE-NEXT: }
-
-; CODE: # kernel0
-; CODE-NEXT: Stmt_bb2(32 * b0 + t0);
-
-; void foo(i120 A[], i120 b) {
-; for (long i = 0; i < 1024; i++)
-; A[i] += b;
-; }
-;
-target datalayout = "e-m:e-i120:64-f80:128-n8:16:32:64-S128"
-
-define void @i120(i120* %A, i120 %b) {
-bb:
- br label %bb1
-
-bb1: ; preds = %bb5, %bb
- %i.0 = phi i120 [ 0, %bb ], [ %tmp6, %bb5 ]
- %exitcond = icmp ne i120 %i.0, 1024
- br i1 %exitcond, label %bb2, label %bb7
-
-bb2: ; preds = %bb1
- %tmp = getelementptr inbounds i120, i120* %A, i120 %i.0
- %tmp3 = load i120, i120* %tmp, align 4
- %tmp4 = add i120 %tmp3, %b
- store i120 %tmp4, i120* %tmp, align 4
- br label %bb5
-
-bb5: ; preds = %bb2
- %tmp6 = add nuw nsw i120 %i.0, 1
- br label %bb1
-
-bb7: ; preds = %bb1
- ret void
-}
-
-; CODE: Code
-; CODE-NEXT: ====
-; CODE-NEXT: # host
-; CODE-NEXT: {
-; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i128), cudaMemcpyHostToDevice));
-; CODE-NEXT: {
-; CODE-NEXT: dim3 k0_dimBlock(32);
-; CODE-NEXT: dim3 k0_dimGrid(32);
-; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A);
-; CODE-NEXT: cudaCheckKernel();
-; CODE-NEXT: }
-
-; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i128), cudaMemcpyDeviceToHost));
-; CODE-NEXT: }
-
-; CODE: # kernel0
-; CODE-NEXT: Stmt_bb2(32 * b0 + t0);
-
-; void foo(i128 A[], i128 b) {
-; for (long i = 0; i < 1024; i++)
-; A[i] += b;
-; }
-;
-target datalayout = "e-m:e-i128:64-f80:128-n8:16:32:64-S128"
-
-define void @i128(i128* %A, i128 %b) {
-bb:
- br label %bb1
-
-bb1: ; preds = %bb5, %bb
- %i.0 = phi i128 [ 0, %bb ], [ %tmp6, %bb5 ]
- %exitcond = icmp ne i128 %i.0, 1024
- br i1 %exitcond, label %bb2, label %bb7
-
-bb2: ; preds = %bb1
- %tmp = getelementptr inbounds i128, i128* %A, i128 %i.0
- %tmp3 = load i128, i128* %tmp, align 4
- %tmp4 = add i128 %tmp3, %b
- store i128 %tmp4, i128* %tmp, align 4
- br label %bb5
-
-bb5: ; preds = %bb2
- %tmp6 = add nuw nsw i128 %i.0, 1
- br label %bb1
-
-bb7: ; preds = %bb1
- ret void
-}
-
-; CODE: Code
-; CODE-NEXT: ====
-; CODE-NEXT: # host
-; CODE-NEXT: {
-; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i3000), cudaMemcpyHostToDevice));
-; CODE-NEXT: {
-; CODE-NEXT: dim3 k0_dimBlock(32);
-; CODE-NEXT: dim3 k0_dimGrid(32);
-; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_A);
-; CODE-NEXT: cudaCheckKernel();
-; CODE-NEXT: }
-
-; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i3000), cudaMemcpyDeviceToHost));
-; CODE-NEXT: }
-
-; CODE: # kernel0
-; CODE-NEXT: Stmt_bb2(32 * b0 + t0);
-
-; void foo(i3000 A[], i3000 b) {
-; for (long i = 0; i < 1024; i++)
-; A[i] += b;
-; }
-;
-target datalayout = "e-m:e-i3000:64-f80:128-n8:16:32:64-S128"
-
-define void @i3000(i3000* %A, i3000 %b) {
-bb:
- br label %bb1
-
-bb1: ; preds = %bb5, %bb
- %i.0 = phi i3000 [ 0, %bb ], [ %tmp6, %bb5 ]
- %exitcond = icmp ne i3000 %i.0, 1024
- br i1 %exitcond, label %bb2, label %bb7
-
-bb2: ; preds = %bb1
- %tmp = getelementptr inbounds i3000, i3000* %A, i3000 %i.0
- %tmp3 = load i3000, i3000* %tmp, align 4
- %tmp4 = add i3000 %tmp3, %b
- store i3000 %tmp4, i3000* %tmp, align 4
- br label %bb5
-
-bb5: ; preds = %bb2
- %tmp6 = add nuw nsw i3000 %i.0, 1
- br label %bb1
-
-bb7: ; preds = %bb1
- ret void
-}
OpenPOWER on IntegriCloud