summaryrefslogtreecommitdiffstats
path: root/clang/lib
diff options
context:
space:
mode:
Diffstat (limited to 'clang/lib')
-rw-r--r--clang/lib/Driver/ToolChains.cpp4
-rw-r--r--clang/lib/Frontend/CompilerInvocation.cpp3
-rw-r--r--clang/lib/Frontend/InitPreprocessor.cpp6
-rw-r--r--clang/lib/Headers/__clang_cuda_runtime_wrapper.h14
4 files changed, 27 insertions, 0 deletions
diff --git a/clang/lib/Driver/ToolChains.cpp b/clang/lib/Driver/ToolChains.cpp
index 2f1420e943f..75aa9958028 100644
--- a/clang/lib/Driver/ToolChains.cpp
+++ b/clang/lib/Driver/ToolChains.cpp
@@ -4502,6 +4502,10 @@ CudaToolChain::addClangTargetOptions(const llvm::opt::ArgList &DriverArgs,
options::OPT_fno_cuda_flush_denormals_to_zero, false))
CC1Args.push_back("-fcuda-flush-denormals-to-zero");
+ if (DriverArgs.hasFlag(options::OPT_fcuda_approx_transcendentals,
+ options::OPT_fno_cuda_approx_transcendentals, false))
+ CC1Args.push_back("-fcuda-approx-transcendentals");
+
if (DriverArgs.hasArg(options::OPT_nocudalib))
return;
diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
index c5f839e6737..852801ab240 100644
--- a/clang/lib/Frontend/CompilerInvocation.cpp
+++ b/clang/lib/Frontend/CompilerInvocation.cpp
@@ -1616,6 +1616,9 @@ static void ParseLangArgs(LangOptions &Opts, ArgList &Args, InputKind IK,
if (Opts.CUDAIsDevice && Args.hasArg(OPT_fcuda_flush_denormals_to_zero))
Opts.CUDADeviceFlushDenormalsToZero = 1;
+ if (Opts.CUDAIsDevice && Args.hasArg(OPT_fcuda_approx_transcendentals))
+ Opts.CUDADeviceApproxTranscendentals = 1;
+
if (Opts.ObjC1) {
if (Arg *arg = Args.getLastArg(OPT_fobjc_runtime_EQ)) {
StringRef value = arg->getValue();
diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp
index 5d38d5f9503..f8b407ba264 100644
--- a/clang/lib/Frontend/InitPreprocessor.cpp
+++ b/clang/lib/Frontend/InitPreprocessor.cpp
@@ -938,6 +938,12 @@ static void InitializePredefinedMacros(const TargetInfo &TI,
Builder.defineMacro("__CUDA_ARCH__");
}
+ // We need to communicate this to our CUDA header wrapper, which in turn
+ // informs the proper CUDA headers of this choice.
+ if (LangOpts.CUDADeviceApproxTranscendentals || LangOpts.FastMath) {
+ Builder.defineMacro("__CLANG_CUDA_APPROX_TRANSCENDENTALS__");
+ }
+
// OpenCL definitions.
if (LangOpts.OpenCL) {
#define OPENCLEXT(Ext) \
diff --git a/clang/lib/Headers/__clang_cuda_runtime_wrapper.h b/clang/lib/Headers/__clang_cuda_runtime_wrapper.h
index 3e41eabac03..bce9d72af35 100644
--- a/clang/lib/Headers/__clang_cuda_runtime_wrapper.h
+++ b/clang/lib/Headers/__clang_cuda_runtime_wrapper.h
@@ -142,7 +142,20 @@
#pragma push_macro("__forceinline__")
#define __forceinline__ __device__ __inline__ __attribute__((always_inline))
#include "device_functions.hpp"
+
+// math_function.hpp uses the __USE_FAST_MATH__ macro to determine whether we
+// get the slow-but-accurate or fast-but-inaccurate versions of functions like
+// sin and exp. This is controlled in clang by -fcuda-approx-transcendentals.
+//
+// device_functions.hpp uses __USE_FAST_MATH__ for a different purpose (fast vs.
+// slow divides), so we need to scope our define carefully here.
+#pragma push_macro("__USE_FAST_MATH__")
+#if defined(__CLANG_CUDA_APPROX_TRANSCENDENTALS__)
+#define __USE_FAST_MATH__
+#endif
#include "math_functions.hpp"
+#pragma pop_macro("__USE_FAST_MATH__")
+
#include "math_functions_dbl_ptx3.hpp"
#pragma pop_macro("__forceinline__")
@@ -296,6 +309,7 @@ __device__ inline __cuda_builtin_gridDim_t::operator dim3() const {
#include "curand_mtgp32_kernel.h"
#pragma pop_macro("dim3")
#pragma pop_macro("uint3")
+#pragma pop_macro("__USE_FAST_MATH__")
#endif // __CUDA__
#endif // __CLANG_CUDA_RUNTIME_WRAPPER_H__
OpenPOWER on IntegriCloud