summary refs log tree commit diff stats
path: root/parallel-libs/streamexecutor/examples/CUDASaxpy.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'parallel-libs/streamexecutor/examples/CUDASaxpy.cpp')
-rw-r--r-- parallel-libs/streamexecutor/examples/CUDASaxpy.cpp 141
1 files changed, 0 insertions, 141 deletions
diff --git a/parallel-libs/streamexecutor/examples/CUDASaxpy.cpp b/parallel-libs/streamexecutor/examples/CUDASaxpy.cpp
deleted file mode 100644
index 6b2c59e5cd6..00000000000
--- a/parallel-libs/streamexecutor/examples/CUDASaxpy.cpp
+++ /dev/null
@@ -1,141 +0,0 @@
-//===-- CUDASaxpy.cpp - Example of CUDA saxpy with StreamExecutor API -----===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file contains example code demonstrating the usage of the
-/// StreamExecutor API. Snippets of this file will be included as code examples
-/// in documentation. Taking these examples from a real source file guarantees
-/// that the examples will always compile.
-///
-//===----------------------------------------------------------------------===//
-
-#include <algorithm>
-#include <cassert>
-#include <cstdlib>
-#include <vector>
-
-#include "streamexecutor/StreamExecutor.h"
-
-/// [Example saxpy compiler-generated]
-// Code in this namespace is generated by the compiler (e.g. clang).
-//
-// The name of this namespace may depend on the compiler that generated it, so
-// this is just an example name.
-namespace __compilergen {
-
-// Specialization of the streamexecutor::Kernel template class for the parameter
-// types of the saxpy(float A, float *X, float *Y) kernel.
-using SaxpyKernel =
- streamexecutor::Kernel<float, streamexecutor::GlobalDeviceMemory<float>,
- streamexecutor::GlobalDeviceMemory<float>>;
-
-// A string containing the PTX code generated by the device compiler for the
-// saxpy kernel. String contents not shown here.
-extern const char *SaxpyPTX;
-
-// A global instance of a loader spec that knows how to load the code in the
-// SaxpyPTX string.
-static streamexecutor::MultiKernelLoaderSpec SaxpyLoaderSpec = []() {
- streamexecutor::MultiKernelLoaderSpec Spec;
- Spec.addCUDAPTXInMemory("saxpy", {{{2, 0}, SaxpyPTX}});
- return Spec;
-}();
-
-} // namespace __compilergen
-/// [Example saxpy compiler-generated]
-
-/// [Example saxpy host PTX]
-// The PTX text for a saxpy kernel.
-const char *__compilergen::SaxpyPTX = R"(
- .version 4.3
- .target sm_20
- .address_size 64
-
- .visible .entry saxpy(.param .f32 A, .param .u64 X, .param .u64 Y) {
- .reg .f32 %AValue;
- .reg .f32 %XValue;
- .reg .f32 %YValue;
- .reg .f32 %Result;
-
- .reg .b64 %XBaseAddrGeneric;
- .reg .b64 %YBaseAddrGeneric;
- .reg .b64 %XBaseAddrGlobal;
- .reg .b64 %YBaseAddrGlobal;
- .reg .b64 %XAddr;
- .reg .b64 %YAddr;
- .reg .b64 %ThreadByteOffset;
-
- .reg .b32 %TID;
-
- ld.param.f32 %AValue, [A];
- ld.param.u64 %XBaseAddrGeneric, [X];
- ld.param.u64 %YBaseAddrGeneric, [Y];
- cvta.to.global.u64 %XBaseAddrGlobal, %XBaseAddrGeneric;
- cvta.to.global.u64 %YBaseAddrGlobal, %YBaseAddrGeneric;
- mov.u32 %TID, %tid.x;
- mul.wide.u32 %ThreadByteOffset, %TID, 4;
- add.s64 %XAddr, %ThreadByteOffset, %XBaseAddrGlobal;
- add.s64 %YAddr, %ThreadByteOffset, %YBaseAddrGlobal;
- ld.global.f32 %XValue, [%XAddr];
- ld.global.f32 %YValue, [%YAddr];
- fma.rn.f32 %Result, %AValue, %XValue, %YValue;
- st.global.f32 [%XAddr], %Result;
- ret;
- }
-)";
-/// [Example saxpy host PTX]
-
-int main() {
- /// [Example saxpy host main]
- namespace se = ::streamexecutor;
- namespace cg = ::__compilergen;
-
- // Create some host data.
- float A = 42.0f;
- std::vector<float> HostX = {0, 1, 2, 3};
- std::vector<float> HostY = {4, 5, 6, 7};
- size_t ArraySize = HostX.size();
-
- // Get a device object.
- se::Platform *Platform =
- getOrDie(se::PlatformManager::getPlatformByName("CUDA"));
- if (Platform->getDeviceCount() == 0) {
- return EXIT_FAILURE;
- }
- se::Device Device = getOrDie(Platform->getDevice(0));
-
- // Load the kernel onto the device.
- cg::SaxpyKernel Kernel =
- getOrDie(Device.createKernel<cg::SaxpyKernel>(cg::SaxpyLoaderSpec));
-
- se::RegisteredHostMemory<float> RegisteredX =
- getOrDie(Device.registerHostMemory<float>(HostX));
- se::RegisteredHostMemory<float> RegisteredY =
- getOrDie(Device.registerHostMemory<float>(HostY));
-
- // Allocate memory on the device.
- se::GlobalDeviceMemory<float> X =
- getOrDie(Device.allocateDeviceMemory<float>(ArraySize));
- se::GlobalDeviceMemory<float> Y =
- getOrDie(Device.allocateDeviceMemory<float>(ArraySize));
-
- // Run operations on a stream.
- se::Stream Stream = getOrDie(Device.createStream());
- Stream.thenCopyH2D(RegisteredX, X)
- .thenCopyH2D(RegisteredY, Y)
- .thenLaunch(ArraySize, 1, Kernel, A, X, Y)
- .thenCopyD2H(X, RegisteredX);
- // Wait for the stream to complete.
- se::dieIfError(Stream.blockHostUntilDone());
-
- // Process output data in HostX.
- std::vector<float> ExpectedX = {4, 47, 90, 133};
- assert(std::equal(ExpectedX.begin(), ExpectedX.end(), HostX.begin()));
- /// [Example saxpy host main]
-}
OpenPOWER on IntegriCloud