//===-- CUDASaxpy.cpp - Example of CUDA saxpy with StreamExecutor API -----===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// /// /// \file /// This file contains example code demonstrating the usage of the /// StreamExecutor API. Snippets of this file will be included as code examples /// in documentation. Taking these examples from a real source file guarantees /// that the examples will always compile. /// //===----------------------------------------------------------------------===// #include #include #include #include #include #include "streamexecutor/StreamExecutor.h" /// [Example saxpy compiler-generated] // Code in this namespace is generated by the compiler (e.g. clang). // // The name of this namespace may depend on the compiler that generated it, so // this is just an example name. namespace __compilergen { // Specialization of the streamexecutor::Kernel template class for the parameter // types of the saxpy(float A, float *X, float *Y) kernel. using SaxpyKernel = streamexecutor::Kernel, streamexecutor::GlobalDeviceMemory>; // A string containing the PTX code generated by the device compiler for the // saxpy kernel. String contents not shown here. extern const char *SaxpyPTX; // A global instance of a loader spec that knows how to load the code in the // SaxpyPTX string. static streamexecutor::MultiKernelLoaderSpec SaxpyLoaderSpec = []() { streamexecutor::MultiKernelLoaderSpec Spec; Spec.addCUDAPTXInMemory("saxpy", {{{2, 0}, SaxpyPTX}}); return Spec; }(); } // namespace __compilergen /// [Example saxpy compiler-generated] /// [Example saxpy host PTX] // The PTX text for a saxpy kernel. const char *__compilergen::SaxpyPTX = R"( .version 4.3 .target sm_20 .address_size 64 .visible .entry saxpy(.param .f32 A, .param .u64 X, .param .u64 Y) { .reg .f32 %AValue; .reg .f32 %XValue; .reg .f32 %YValue; .reg .f32 %Result; .reg .b64 %XBaseAddrGeneric; .reg .b64 %YBaseAddrGeneric; .reg .b64 %XBaseAddrGlobal; .reg .b64 %YBaseAddrGlobal; .reg .b64 %XAddr; .reg .b64 %YAddr; .reg .b64 %ThreadByteOffset; .reg .b32 %TID; ld.param.f32 %AValue, [A]; ld.param.u64 %XBaseAddrGeneric, [X]; ld.param.u64 %YBaseAddrGeneric, [Y]; cvta.to.global.u64 %XBaseAddrGlobal, %XBaseAddrGeneric; cvta.to.global.u64 %YBaseAddrGlobal, %YBaseAddrGeneric; mov.u32 %TID, %tid.x; mul.wide.u32 %ThreadByteOffset, %TID, 4; add.s64 %XAddr, %ThreadByteOffset, %XBaseAddrGlobal; add.s64 %YAddr, %ThreadByteOffset, %YBaseAddrGlobal; ld.global.f32 %XValue, [%XAddr]; ld.global.f32 %YValue, [%YAddr]; fma.rn.f32 %Result, %AValue, %XValue, %YValue; st.global.f32 [%XAddr], %Result; ret; } )"; /// [Example saxpy host PTX] int main() { /// [Example saxpy host main] namespace se = ::streamexecutor; namespace cg = ::__compilergen; // Create some host data. float A = 42.0f; std::vector HostX = {0, 1, 2, 3}; std::vector HostY = {4, 5, 6, 7}; size_t ArraySize = HostX.size(); // Get a device object. se::Platform *Platform = getOrDie(se::PlatformManager::getPlatformByName("CUDA")); if (Platform->getDeviceCount() == 0) { return EXIT_FAILURE; } se::Device *Device = getOrDie(Platform->getDevice(0)); // Load the kernel onto the device. cg::SaxpyKernel Kernel = getOrDie(Device->createKernel(cg::SaxpyLoaderSpec)); se::RegisteredHostMemory RegisteredX = getOrDie(Device->registerHostMemory(HostX)); se::RegisteredHostMemory RegisteredY = getOrDie(Device->registerHostMemory(HostY)); // Allocate memory on the device. se::GlobalDeviceMemory X = getOrDie(Device->allocateDeviceMemory(ArraySize)); se::GlobalDeviceMemory Y = getOrDie(Device->allocateDeviceMemory(ArraySize)); // Run operations on a stream. se::Stream Stream = getOrDie(Device->createStream()); Stream.thenCopyH2D(RegisteredX, X) .thenCopyH2D(RegisteredY, Y) .thenLaunch(ArraySize, 1, Kernel, A, X, Y) .thenCopyD2H(X, RegisteredX); // Wait for the stream to complete. se::dieIfError(Stream.blockHostUntilDone()); // Process output data in HostX. std::vector ExpectedX = {4, 47, 90, 133}; assert(std::equal(ExpectedX.begin(), ExpectedX.end(), HostX.begin())); /// [Example saxpy host main] }