diff options
author | Artem Belevich <tra@google.com> | 2015-07-13 23:27:56 +0000 |
---|---|---|
committer | Artem Belevich <tra@google.com> | 2015-07-13 23:27:56 +0000 |
commit | 0ff05cd1651474678e8f503cae4956f0c342bf67 (patch) | |
tree | 98c608f18a4fd854516da066943e603c71280988 /clang | |
parent | 2eacca86ef2c255bae8eff43056dfa6c57ae7092 (diff) | |
download | bcm5719-llvm-0ff05cd1651474678e8f503cae4956f0c342bf67.tar.gz bcm5719-llvm-0ff05cd1651474678e8f503cae4956f0c342bf67.zip |
[cuda] Driver changes to compile and stitch together host and device-side CUDA code.
NOTE: reverts r242077 to reinstate r242058, r242065, 242067
and includes fix for OS X test failures.
- Changed driver pipeline to compile host and device side of CUDA
files and incorporate results of device-side compilation into host
object file.
- Added a test for cuda pipeline creation in clang driver.
New clang options:
--cuda-host-only - Do host-side compilation only.
--cuda-device-only - Do device-side compilation only.
--cuda-gpu-arch=<ARCH> - specify GPU architecture for device-side
compilation. E.g. sm_35, sm_30. Default is sm_20. May be used more
than once in which case one device-compilation will be done per
unique specified GPU architecture.
Differential Revision: http://reviews.llvm.org/D9509
llvm-svn: 242085
Diffstat (limited to 'clang')
-rw-r--r-- | clang/include/clang/Driver/Action.h | 37 | ||||
-rw-r--r-- | clang/include/clang/Driver/Options.td | 6 | ||||
-rw-r--r-- | clang/include/clang/Driver/Types.def | 1 | ||||
-rw-r--r-- | clang/include/clang/Driver/Types.h | 3 | ||||
-rw-r--r-- | clang/lib/Driver/Action.cpp | 21 | ||||
-rw-r--r-- | clang/lib/Driver/Driver.cpp | 178 | ||||
-rw-r--r-- | clang/lib/Driver/ToolChain.cpp | 2 | ||||
-rw-r--r-- | clang/lib/Driver/ToolChains.cpp | 59 | ||||
-rw-r--r-- | clang/lib/Driver/ToolChains.h | 12 | ||||
-rw-r--r-- | clang/lib/Driver/Tools.cpp | 36 | ||||
-rw-r--r-- | clang/lib/Driver/Types.cpp | 21 | ||||
-rw-r--r-- | clang/lib/Frontend/CreateInvocationFromCommandLine.cpp | 21 | ||||
-rw-r--r-- | clang/test/Driver/cuda-options.cu | 109 | ||||
-rw-r--r-- | clang/test/Index/attributes-cuda.cu | 22 | ||||
-rw-r--r-- | clang/test/Index/index-file.cu | 9 | ||||
-rw-r--r-- | clang/tools/libclang/CIndex.cpp | 6 | ||||
-rw-r--r-- | clang/unittests/ASTMatchers/ASTMatchersTest.h | 1 |
17 files changed, 516 insertions, 28 deletions
diff --git a/clang/include/clang/Driver/Action.h b/clang/include/clang/Driver/Action.h index 847c994a584..fddd15885e1 100644 --- a/clang/include/clang/Driver/Action.h +++ b/clang/include/clang/Driver/Action.h @@ -41,6 +41,8 @@ public: enum ActionClass { InputClass = 0, BindArchClass, + CudaDeviceClass, + CudaHostClass, PreprocessJobClass, PrecompileJobClass, AnalyzeJobClass, @@ -133,6 +135,41 @@ public: } }; +class CudaDeviceAction : public Action { + virtual void anchor(); + /// GPU architecture to bind -- e.g 'sm_35'. + const char *GpuArchName; + /// True when action results are not consumed by the host action (e.g when + /// -fsyntax-only or --cuda-device-only options are used). + bool AtTopLevel; + +public: + CudaDeviceAction(std::unique_ptr<Action> Input, const char *ArchName, + bool AtTopLevel); + + const char *getGpuArchName() const { return GpuArchName; } + bool isAtTopLevel() const { return AtTopLevel; } + + static bool classof(const Action *A) { + return A->getKind() == CudaDeviceClass; + } +}; + +class CudaHostAction : public Action { + virtual void anchor(); + ActionList DeviceActions; + +public: + CudaHostAction(std::unique_ptr<Action> Input, + const ActionList &DeviceActions); + ~CudaHostAction() override; + + ActionList &getDeviceActions() { return DeviceActions; } + const ActionList &getDeviceActions() const { return DeviceActions; } + + static bool classof(const Action *A) { return A->getKind() == CudaHostClass; } +}; + class JobAction : public Action { virtual void anchor(); protected: diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 89180b8f540..781cc352547 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -351,6 +351,12 @@ def cxx_isystem : JoinedOrSeparate<["-"], "cxx-isystem">, Group<clang_i_Group>, MetaVarName<"<directory>">; def c : Flag<["-"], "c">, Flags<[DriverOption]>, HelpText<"Only run preprocess, compile, and assemble steps">; +def cuda_device_only : Flag<["--"], "cuda-device-only">, + HelpText<"Do device-side CUDA compilation only">; +def cuda_gpu_arch_EQ : Joined<["--"], "cuda-gpu-arch=">, + Flags<[DriverOption, HelpHidden]>, HelpText<"CUDA GPU architecture">; +def cuda_host_only : Flag<["--"], "cuda-host-only">, + HelpText<"Do host-side CUDA compilation only">; def dA : Flag<["-"], "dA">, Group<d_Group>; def dD : Flag<["-"], "dD">, Group<d_Group>, Flags<[CC1Option]>, HelpText<"Print macro definitions in -E mode in addition to normal output">; diff --git a/clang/include/clang/Driver/Types.def b/clang/include/clang/Driver/Types.def index 4b696ae5e05..d1b69151b06 100644 --- a/clang/include/clang/Driver/Types.def +++ b/clang/include/clang/Driver/Types.def @@ -44,6 +44,7 @@ TYPE("c", C, PP_C, "c", "u") TYPE("cl", CL, PP_C, "cl", "u") TYPE("cuda-cpp-output", PP_CUDA, INVALID, "cui", "u") TYPE("cuda", CUDA, PP_CUDA, "cu", "u") +TYPE("cuda", CUDA_DEVICE, PP_CUDA, "cu", "") TYPE("objective-c-cpp-output", PP_ObjC, INVALID, "mi", "u") TYPE("objc-cpp-output", PP_ObjC_Alias, INVALID, "mi", "u") TYPE("objective-c", ObjC, PP_ObjC, "m", "u") diff --git a/clang/include/clang/Driver/Types.h b/clang/include/clang/Driver/Types.h index 34442eb6379..dd95d659915 100644 --- a/clang/include/clang/Driver/Types.h +++ b/clang/include/clang/Driver/Types.h @@ -63,6 +63,9 @@ namespace types { /// isCXX - Is this a "C++" input (C++ and Obj-C++ sources and headers). bool isCXX(ID Id); + /// isCuda - Is this a CUDA input. + bool isCuda(ID Id); + /// isObjC - Is this an "ObjC" input (Obj-C and Obj-C++ sources and headers). bool isObjC(ID Id); diff --git a/clang/lib/Driver/Action.cpp b/clang/lib/Driver/Action.cpp index 360dbeecabf..3219dc1cc0e 100644 --- a/clang/lib/Driver/Action.cpp +++ b/clang/lib/Driver/Action.cpp @@ -24,6 +24,8 @@ const char *Action::getClassName(ActionClass AC) { switch (AC) { case InputClass: return "input"; case BindArchClass: return "bind-arch"; + case CudaDeviceClass: return "cuda-device"; + case CudaHostClass: return "cuda-host"; case PreprocessJobClass: return "preprocessor"; case PrecompileJobClass: return "precompiler"; case AnalyzeJobClass: return "analyzer"; @@ -53,6 +55,25 @@ BindArchAction::BindArchAction(std::unique_ptr<Action> Input, const char *_ArchName) : Action(BindArchClass, std::move(Input)), ArchName(_ArchName) {} +void CudaDeviceAction::anchor() {} + +CudaDeviceAction::CudaDeviceAction(std::unique_ptr<Action> Input, + const char *ArchName, bool AtTopLevel) + : Action(CudaDeviceClass, std::move(Input)), GpuArchName(ArchName), + AtTopLevel(AtTopLevel) {} + +void CudaHostAction::anchor() {} + +CudaHostAction::CudaHostAction(std::unique_ptr<Action> Input, + const ActionList &_DeviceActions) + : Action(CudaHostClass, std::move(Input)), DeviceActions(_DeviceActions) {} + +CudaHostAction::~CudaHostAction() { + for (iterator it = DeviceActions.begin(), ie = DeviceActions.end(); it != ie; + ++it) + delete *it; +} + void JobAction::anchor() {} JobAction::JobAction(ActionClass Kind, std::unique_ptr<Action> Input, diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index f3ec151ec94..180c412bd79 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -174,8 +174,10 @@ phases::ID Driver::getFinalPhase(const DerivedArgList &DAL, } else if ((PhaseArg = DAL.getLastArg(options::OPT_S))) { FinalPhase = phases::Backend; - // -c only runs up to the assembler. - } else if ((PhaseArg = DAL.getLastArg(options::OPT_c))) { + // -c and partial CUDA compilations only run up to the assembler. + } else if ((PhaseArg = DAL.getLastArg(options::OPT_c)) || + (PhaseArg = DAL.getLastArg(options::OPT_cuda_device_only)) || + (PhaseArg = DAL.getLastArg(options::OPT_cuda_host_only))) { FinalPhase = phases::Assemble; // Otherwise do everything. @@ -900,9 +902,20 @@ static unsigned PrintActions1(const Compilation &C, Action *A, } else if (BindArchAction *BIA = dyn_cast<BindArchAction>(A)) { os << '"' << BIA->getArchName() << '"' << ", {" << PrintActions1(C, *BIA->begin(), Ids) << "}"; + } else if (CudaDeviceAction *CDA = dyn_cast<CudaDeviceAction>(A)) { + os << '"' << CDA->getGpuArchName() << '"' << ", {" + << PrintActions1(C, *CDA->begin(), Ids) << "}"; } else { + ActionList *AL; + if (CudaHostAction *CHA = dyn_cast<CudaHostAction>(A)) { + os << "{" << PrintActions1(C, *CHA->begin(), Ids) << "}" + << ", gpu binaries "; + AL = &CHA->getDeviceActions(); + } else + AL = &A->getInputs(); + const char *Prefix = "{"; - for (Action *PreRequisite : *A) { + for (Action *PreRequisite : *AL) { os << Prefix << PrintActions1(C, PreRequisite, Ids); Prefix = ", "; } @@ -1215,6 +1228,93 @@ void Driver::BuildInputs(const ToolChain &TC, DerivedArgList &Args, } } +// For each unique --cuda-gpu-arch= argument creates a TY_CUDA_DEVICE input +// action and then wraps each in CudaDeviceAction paired with appropriate GPU +// arch name. If we're only building device-side code, each action remains +// independent. Otherwise we pass device-side actions as inputs to a new +// CudaHostAction which combines both host and device side actions. +static std::unique_ptr<Action> +buildCudaActions(const Driver &D, const ToolChain &TC, DerivedArgList &Args, + const Arg *InputArg, const types::ID InputType, + std::unique_ptr<Action> Current, ActionList &Actions) { + + assert(InputType == types::TY_CUDA && + "CUDA Actions only apply to CUDA inputs."); + + // Collect all cuda_gpu_arch parameters, removing duplicates. + SmallVector<const char *, 4> GpuArchList; + llvm::StringSet<> GpuArchNames; + for (Arg *A : Args) { + if (A->getOption().matches(options::OPT_cuda_gpu_arch_EQ)) { + A->claim(); + if (GpuArchNames.insert(A->getValue()).second) + GpuArchList.push_back(A->getValue()); + } + } + + // Default to sm_20 which is the lowest common denominator for supported GPUs. + // sm_20 code should work correctly, if suboptimally, on all newer GPUs. + if (GpuArchList.empty()) + GpuArchList.push_back("sm_20"); + + // Replicate inputs for each GPU architecture. + Driver::InputList CudaDeviceInputs; + for (unsigned i = 0, e = GpuArchList.size(); i != e; ++i) + CudaDeviceInputs.push_back(std::make_pair(types::TY_CUDA_DEVICE, InputArg)); + + // Build actions for all device inputs. + ActionList CudaDeviceActions; + D.BuildActions(TC, Args, CudaDeviceInputs, CudaDeviceActions); + assert(GpuArchList.size() == CudaDeviceActions.size() && + "Failed to create actions for all devices"); + + // Check whether any of device actions stopped before they could generate PTX. + bool PartialCompilation = false; + bool DeviceOnlyCompilation = Args.hasArg(options::OPT_cuda_device_only); + for (unsigned i = 0, e = GpuArchList.size(); i != e; ++i) { + if (CudaDeviceActions[i]->getKind() != Action::BackendJobClass) { + PartialCompilation = true; + break; + } + } + + // Figure out what to do with device actions -- pass them as inputs to the + // host action or run each of them independently. + if (PartialCompilation || DeviceOnlyCompilation) { + // In case of partial or device-only compilation results of device actions + // are not consumed by the host action device actions have to be added to + // top-level actions list with AtTopLevel=true and run independently. + + // -o is ambiguous if we have more than one top-level action. + if (Args.hasArg(options::OPT_o) && + (!DeviceOnlyCompilation || GpuArchList.size() > 1)) { + D.Diag(clang::diag::err_drv_output_argument_with_multiple_files); + return nullptr; + } + + for (unsigned i = 0, e = GpuArchList.size(); i != e; ++i) + Actions.push_back( + new CudaDeviceAction(std::unique_ptr<Action>(CudaDeviceActions[i]), + GpuArchList[i], /* AtTopLevel */ true)); + // Kill host action in case of device-only compilation. + if (DeviceOnlyCompilation) + Current.reset(nullptr); + return Current; + } else { + // Outputs of device actions during complete CUDA compilation get created + // with AtTopLevel=false and become inputs for the host action. + ActionList DeviceActions; + for (unsigned i = 0, e = GpuArchList.size(); i != e; ++i) + DeviceActions.push_back( + new CudaDeviceAction(std::unique_ptr<Action>(CudaDeviceActions[i]), + GpuArchList[i], /* AtTopLevel */ false)); + // Return a new host action that incorporates original host action and all + // device actions. + return std::unique_ptr<Action>( + new CudaHostAction(std::move(Current), DeviceActions)); + } +} + void Driver::BuildActions(const ToolChain &TC, DerivedArgList &Args, const InputList &Inputs, ActionList &Actions) const { llvm::PrettyStackTraceString CrashInfo("Building compilation actions"); @@ -1312,6 +1412,25 @@ void Driver::BuildActions(const ToolChain &TC, DerivedArgList &Args, continue; } + phases::ID CudaInjectionPhase; + if (isSaveTempsEnabled()) { + // All phases are done independently, inject GPU blobs during compilation + // phase as that's where we generate glue code to init them. + CudaInjectionPhase = phases::Compile; + } else { + // Assumes that clang does everything up until linking phase, so we inject + // cuda device actions at the last step before linking. Otherwise CUDA + // host action forces preprocessor into a separate invocation. + if (FinalPhase == phases::Link) { + for (auto i = PL.begin(), e = PL.end(); i != e; ++i) { + auto next = i + 1; + if (next != e && *next == phases::Link) + CudaInjectionPhase = *i; + } + } else + CudaInjectionPhase = FinalPhase; + } + // Build the pipeline for this file. std::unique_ptr<Action> Current(new InputAction(*InputArg, InputType)); for (SmallVectorImpl<phases::ID>::iterator i = PL.begin(), e = PL.end(); @@ -1337,6 +1456,15 @@ void Driver::BuildActions(const ToolChain &TC, DerivedArgList &Args, // Otherwise construct the appropriate action. Current = ConstructPhaseAction(TC, Args, Phase, std::move(Current)); + + if (InputType == types::TY_CUDA && Phase == CudaInjectionPhase && + !Args.hasArg(options::OPT_cuda_host_only)) { + Current = buildCudaActions(*this, TC, Args, InputArg, InputType, + std::move(Current), Actions); + if (!Current) + break; + } + if (Current->getType() == types::TY_Nothing) break; } @@ -1576,7 +1704,13 @@ static const Tool *SelectToolForJob(Compilation &C, bool SaveTemps, if (isa<BackendJobAction>(JA)) { // Check if the compiler supports emitting LLVM IR. assert(Inputs->size() == 1); - JobAction *CompileJA = cast<CompileJobAction>(*Inputs->begin()); + JobAction *CompileJA; + // Extract real host action, if it's a CudaHostAction. + if (CudaHostAction *CudaHA = dyn_cast<CudaHostAction>(*Inputs->begin())) + CompileJA = cast<CompileJobAction>(*CudaHA->begin()); + else + CompileJA = cast<CompileJobAction>(*Inputs->begin()); + const Tool *Compiler = TC->SelectTool(*CompileJA); if (!Compiler) return nullptr; @@ -1610,6 +1744,20 @@ void Driver::BuildJobsForAction(Compilation &C, const Action *A, InputInfo &Result) const { llvm::PrettyStackTraceString CrashInfo("Building compilation jobs"); + InputInfoList CudaDeviceInputInfos; + if (const CudaHostAction *CHA = dyn_cast<CudaHostAction>(A)) { + InputInfo II; + // Append outputs of device jobs to the input list. + for (const Action *DA : CHA->getDeviceActions()) { + BuildJobsForAction(C, DA, TC, "", AtTopLevel, + /*MultipleArchs*/ false, LinkingOutput, II); + CudaDeviceInputInfos.push_back(II); + } + // Override current action with a real host compile action and continue + // processing it. + A = *CHA->begin(); + } + if (const InputAction *IA = dyn_cast<InputAction>(A)) { // FIXME: It would be nice to not claim this here; maybe the old scheme of // just using Args was better? @@ -1635,11 +1783,24 @@ void Driver::BuildJobsForAction(Compilation &C, const Action *A, else TC = &C.getDefaultToolChain(); - BuildJobsForAction(C, *BAA->begin(), TC, BAA->getArchName(), AtTopLevel, + BuildJobsForAction(C, *BAA->begin(), TC, ArchName, AtTopLevel, MultipleArchs, LinkingOutput, Result); return; } + if (const CudaDeviceAction *CDA = dyn_cast<CudaDeviceAction>(A)) { + // Figure out which NVPTX triple to use for device-side compilation based on + // whether host is 64-bit. + llvm::Triple DeviceTriple(C.getDefaultToolChain().getTriple().isArch64Bit() + ? "nvptx64-nvidia-cuda" + : "nvptx-nvidia-cuda"); + BuildJobsForAction(C, *CDA->begin(), + &getToolChain(C.getArgs(), DeviceTriple), + CDA->getGpuArchName(), CDA->isAtTopLevel(), + /*MultipleArchs*/ true, LinkingOutput, Result); + return; + } + const ActionList *Inputs = &A->getInputs(); const JobAction *JA = cast<JobAction>(A); @@ -1671,6 +1832,10 @@ void Driver::BuildJobsForAction(Compilation &C, const Action *A, if (JA->getType() == types::TY_dSYM) BaseInput = InputInfos[0].getFilename(); + // Append outputs of cuda device jobs to the input list + if (CudaDeviceInputInfos.size()) + InputInfos.append(CudaDeviceInputInfos.begin(), CudaDeviceInputInfos.end()); + // Determine the place to write output to, if any. if (JA->getType() == types::TY_Nothing) Result = InputInfo(A->getType(), BaseInput); @@ -2052,6 +2217,9 @@ const ToolChain &Driver::getToolChain(const ArgList &Args, break; } break; + case llvm::Triple::CUDA: + TC = new toolchains::CudaToolChain(*this, Target, Args); + break; default: // Of these targets, Hexagon is the only one that might have // an OS of Linux, in which case it got handled above already. diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp index da020a2b8c9..e6a1bc9685d 100644 --- a/clang/lib/Driver/ToolChain.cpp +++ b/clang/lib/Driver/ToolChain.cpp @@ -151,6 +151,8 @@ Tool *ToolChain::getTool(Action::ActionClass AC) const { case Action::InputClass: case Action::BindArchClass: + case Action::CudaDeviceClass: + case Action::CudaHostClass: case Action::LipoJobClass: case Action::DsymutilJobClass: case Action::VerifyDebugInfoJobClass: diff --git a/clang/lib/Driver/ToolChains.cpp b/clang/lib/Driver/ToolChains.cpp index eafc72b6b12..15e36a1e6ce 100644 --- a/clang/lib/Driver/ToolChains.cpp +++ b/clang/lib/Driver/ToolChains.cpp @@ -3652,6 +3652,65 @@ Tool *DragonFly::buildLinker() const { return new tools::dragonfly::Linker(*this); } +/// Stub for CUDA toolchain. At the moment we don't have assembler or +/// linker and need toolchain mainly to propagate device-side options +/// to CC1. + +CudaToolChain::CudaToolChain(const Driver &D, const llvm::Triple &Triple, + const ArgList &Args) + : Linux(D, Triple, Args) {} + +void +CudaToolChain::addClangTargetOptions(const llvm::opt::ArgList &DriverArgs, + llvm::opt::ArgStringList &CC1Args) const { + Linux::addClangTargetOptions(DriverArgs, CC1Args); + CC1Args.push_back("-fcuda-is-device"); +} + +llvm::opt::DerivedArgList * +CudaToolChain::TranslateArgs(const llvm::opt::DerivedArgList &Args, + const char *BoundArch) const { + DerivedArgList *DAL = new DerivedArgList(Args.getBaseArgs()); + const OptTable &Opts = getDriver().getOpts(); + + for (Arg *A : Args) { + if (A->getOption().matches(options::OPT_Xarch__)) { + // Skip this argument unless the architecture matches BoundArch + if (A->getValue(0) != StringRef(BoundArch)) + continue; + + unsigned Index = Args.getBaseArgs().MakeIndex(A->getValue(1)); + unsigned Prev = Index; + std::unique_ptr<Arg> XarchArg(Opts.ParseOneArg(Args, Index)); + + // If the argument parsing failed or more than one argument was + // consumed, the -Xarch_ argument's parameter tried to consume + // extra arguments. Emit an error and ignore. + // + // We also want to disallow any options which would alter the + // driver behavior; that isn't going to work in our model. We + // use isDriverOption() as an approximation, although things + // like -O4 are going to slip through. + if (!XarchArg || Index > Prev + 1) { + getDriver().Diag(diag::err_drv_invalid_Xarch_argument_with_args) + << A->getAsString(Args); + continue; + } else if (XarchArg->getOption().hasFlag(options::DriverOption)) { + getDriver().Diag(diag::err_drv_invalid_Xarch_argument_isdriver) + << A->getAsString(Args); + continue; + } + XarchArg->setBaseArg(A); + A = XarchArg.release(); + DAL->AddSynthesizedArg(A); + } + DAL->append(A); + } + + DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ), BoundArch); + return DAL; +} + /// XCore tool chain XCore::XCore(const Driver &D, const llvm::Triple &Triple, const ArgList &Args) : ToolChain(D, Triple, Args) { diff --git a/clang/lib/Driver/ToolChains.h b/clang/lib/Driver/ToolChains.h index 36896820447..327ff9b13b1 100644 --- a/clang/lib/Driver/ToolChains.h +++ b/clang/lib/Driver/ToolChains.h @@ -699,6 +699,18 @@ private: std::string computeSysRoot() const; }; +class LLVM_LIBRARY_VISIBILITY CudaToolChain : public Linux { +public: + CudaToolChain(const Driver &D, const llvm::Triple &Triple, + const llvm::opt::ArgList &Args); + + llvm::opt::DerivedArgList * + TranslateArgs(const llvm::opt::DerivedArgList &Args, + const char *BoundArch) const override; + void addClangTargetOptions(const llvm::opt::ArgList &DriverArgs, + llvm::opt::ArgStringList &CC1Args) const override; +}; + class LLVM_LIBRARY_VISIBILITY Hexagon_TC : public Linux { protected: GCCVersion GCCLibAndIncVersion; diff --git a/clang/lib/Driver/Tools.cpp b/clang/lib/Driver/Tools.cpp index bf9b4baeb3b..c6dc178849a 100644 --- a/clang/lib/Driver/Tools.cpp +++ b/clang/lib/Driver/Tools.cpp @@ -1488,6 +1488,12 @@ static std::string getCPUName(const ArgList &Args, const llvm::Triple &T) { return CPUName; } + case llvm::Triple::nvptx: + case llvm::Triple::nvptx64: + if (const Arg *A = Args.getLastArg(options::OPT_march_EQ)) + return A->getValue(); + return ""; + case llvm::Triple::ppc: case llvm::Triple::ppc64: case llvm::Triple::ppc64le: { @@ -2826,8 +2832,14 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, getToolChain().getTriple().isWindowsCygwinEnvironment(); bool IsWindowsMSVC = getToolChain().getTriple().isWindowsMSVCEnvironment(); - assert(Inputs.size() == 1 && "Unable to handle multiple inputs."); + // Check number of inputs for sanity. We need at least one input. + assert(Inputs.size() >= 1 && "Must have at least one input."); const InputInfo &Input = Inputs[0]; + // CUDA compilation may have multiple inputs (source file + results of + // device-side compilations). All other jobs are expected to have exactly one + // input. + bool IsCuda = types::isCuda(Input.getType()); + assert((IsCuda || Inputs.size() == 1) && "Unable to handle multiple inputs."); // Invoke ourselves in -cc1 mode. // @@ -4812,14 +4824,12 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, assert(Output.isNothing() && "Invalid output."); } - for (const auto &II : Inputs) { - addDashXForInput(Args, II, CmdArgs); + addDashXForInput(Args, Input, CmdArgs); - if (II.isFilename()) - CmdArgs.push_back(II.getFilename()); - else - II.getInputArg().renderAsInput(Args, CmdArgs); - } + if (Input.isFilename()) + CmdArgs.push_back(Input.getFilename()); + else + Input.getInputArg().renderAsInput(Args, CmdArgs); Args.AddAllArgs(CmdArgs, options::OPT_undef); @@ -4857,6 +4867,16 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back(SplitDwarfOut); } + // Host-side cuda compilation receives device-side outputs as Inputs[1...]. + // Include them with -fcuda-include-gpubinary. + if (IsCuda && Inputs.size() > 1) + for (InputInfoList::const_iterator it = std::next(Inputs.begin()), + ie = Inputs.end(); + it != ie; ++it) { + CmdArgs.push_back("-fcuda-include-gpubinary"); + CmdArgs.push_back(it->getFilename()); + } + // Finally add the compile command to the compilation. if (Args.hasArg(options::OPT__SLASH_fallback) && Output.getType() == types::TY_Object && diff --git a/clang/lib/Driver/Types.cpp b/clang/lib/Driver/Types.cpp index 7b281457557..2085b012429 100644 --- a/clang/lib/Driver/Types.cpp +++ b/clang/lib/Driver/Types.cpp @@ -86,6 +86,7 @@ bool types::isAcceptedByClang(ID Id) { case TY_C: case TY_PP_C: case TY_CL: case TY_CUDA: case TY_PP_CUDA: + case TY_CUDA_DEVICE: case TY_ObjC: case TY_PP_ObjC: case TY_PP_ObjC_Alias: case TY_CXX: case TY_PP_CXX: case TY_ObjCXX: case TY_PP_ObjCXX: case TY_PP_ObjCXX_Alias: @@ -122,7 +123,19 @@ bool types::isCXX(ID Id) { case TY_ObjCXX: case TY_PP_ObjCXX: case TY_PP_ObjCXX_Alias: case TY_CXXHeader: case TY_PP_CXXHeader: case TY_ObjCXXHeader: case TY_PP_ObjCXXHeader: - case TY_CUDA: case TY_PP_CUDA: + case TY_CUDA: case TY_PP_CUDA: case TY_CUDA_DEVICE: + return true; + } +} + +bool types::isCuda(ID Id) { + switch (Id) { + default: + return false; + + case TY_CUDA: + case TY_PP_CUDA: + case TY_CUDA_DEVICE: return true; } } @@ -206,10 +219,12 @@ void types::getCompilationPhases(ID Id, llvm::SmallVectorImpl<phases::ID> &P) { P.push_back(phases::Compile); P.push_back(phases::Backend); } - P.push_back(phases::Assemble); + if (Id != TY_CUDA_DEVICE) + P.push_back(phases::Assemble); } } - if (!onlyPrecompileType(Id)) { + + if (!onlyPrecompileType(Id) && Id != TY_CUDA_DEVICE) { P.push_back(phases::Link); } assert(0 < P.size() && "Not enough phases in list"); diff --git a/clang/lib/Frontend/CreateInvocationFromCommandLine.cpp b/clang/lib/Frontend/CreateInvocationFromCommandLine.cpp index 4a8a8a029e7..2afd23fcb9e 100644 --- a/clang/lib/Frontend/CreateInvocationFromCommandLine.cpp +++ b/clang/lib/Frontend/CreateInvocationFromCommandLine.cpp @@ -15,6 +15,7 @@ #include "clang/Basic/DiagnosticOptions.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" +#include "clang/Driver/Action.h" #include "clang/Driver/Options.h" #include "clang/Driver/Tool.h" #include "clang/Frontend/CompilerInstance.h" @@ -61,9 +62,25 @@ clang::createInvocationFromCommandLine(ArrayRef<const char *> ArgList, } // We expect to get back exactly one command job, if we didn't something - // failed. + // failed. CUDA compilation is an exception as it creates multiple jobs. If + // that's the case, we proceed with the first job. If caller needs particular + // CUDA job, it should be controlled via --cuda-{host|device}-only option + // passed to the driver. const driver::JobList &Jobs = C->getJobs(); - if (Jobs.size() != 1 || !isa<driver::Command>(*Jobs.begin())) { + bool CudaCompilation = false; + if (Jobs.size() > 1) { + for (auto &A : C->getActions()){ + // On MacOSX real actions may end up being wrapped in BindArchAction + if (isa<driver::BindArchAction>(A)) + A = *A->begin(); + if (isa<driver::CudaDeviceAction>(A)) { + CudaCompilation = true; + break; + } + } + } + if (Jobs.size() == 0 || !isa<driver::Command>(*Jobs.begin()) || + (Jobs.size() > 1 && !CudaCompilation)) { SmallString<256> Msg; llvm::raw_svector_ostream OS(Msg); Jobs.Print(OS, "; ", true); diff --git a/clang/test/Driver/cuda-options.cu b/clang/test/Driver/cuda-options.cu new file mode 100644 index 00000000000..ecb3f228eab --- /dev/null +++ b/clang/test/Driver/cuda-options.cu @@ -0,0 +1,109 @@ +// Tests CUDA compilation pipeline construction in Driver. +// REQUIRES: clang-driver + +// Simple compilation case: +// RUN: %clang -### -c %s 2>&1 \ +// Compile device-side to PTX assembly and make sure we use it on the host side. +// RUN: | FileCheck -check-prefix CUDA-D1 \ +// Then compile host side and incorporate device code. +// RUN: -check-prefix CUDA-H -check-prefix CUDA-H-I1 \ +// Make sure we don't link anything. +// RUN: -check-prefix CUDA-NL %s + +// Typical compilation + link case: +// RUN: %clang -### %s 2>&1 \ +// Compile device-side to PTX assembly and make sure we use it on the host side +// RUN: | FileCheck -check-prefix CUDA-D1 \ +// Then compile host side and incorporate device code. +// RUN: -check-prefix CUDA-H -check-prefix CUDA-H-I1 \ +// Then link things. +// RUN: -check-prefix CUDA-L %s + +// Verify that -cuda-no-device disables device-side compilation and linking +// RUN: %clang -### --cuda-host-only %s 2>&1 \ +// Make sure we didn't run device-side compilation. +// RUN: | FileCheck -check-prefix CUDA-ND \ +// Then compile host side and make sure we don't attempt to incorporate GPU code. +// RUN: -check-prefix CUDA-H -check-prefix CUDA-H-NI \ +// Make sure we don't link anything. +// RUN: -check-prefix CUDA-NL %s + +// Verify that -cuda-no-host disables host-side compilation and linking +// RUN: %clang -### --cuda-device-only %s 2>&1 \ +// Compile device-side to PTX assembly +// RUN: | FileCheck -check-prefix CUDA-D1 \ +// Make sure there are no host cmpilation or linking. +// RUN: -check-prefix CUDA-NH -check-prefix CUDA-NL %s + +// Verify that with -S we compile host and device sides to assembly +// and incorporate device code on the host side. +// RUN: %clang -### -S -c %s 2>&1 \ +// Compile device-side to PTX assembly +// RUN: | FileCheck -check-prefix CUDA-D1 \ +// Then compile host side and incorporate GPU code. +// RUN: -check-prefix CUDA-H -check-prefix CUDA-H-I1 \ +// Make sure we don't link anything. +// RUN: -check-prefix CUDA-NL %s + +// Verify that --cuda-gpu-arch option passes correct GPU +// archtecture info to device compilation. +// RUN: %clang -### --cuda-gpu-arch=sm_35 -c %s 2>&1 \ +// Compile device-side to PTX assembly. +// RUN: | FileCheck -check-prefix CUDA-D1 -check-prefix CUDA-D1-SM35 \ +// Then compile host side and incorporate GPU code. +// RUN: -check-prefix CUDA-H -check-prefix CUDA-H-I1 \ +// Make sure we don't link anything. +// RUN: -check-prefix CUDA-NL %s + +// Verify that there is device-side compilation per --cuda-gpu-arch args +// and that all results are included on the host side. +// RUN: %clang -### --cuda-gpu-arch=sm_35 --cuda-gpu-arch=sm_30 -c %s 2>&1 \ +// Compile both device-sides to PTX assembly +// RUN: | FileCheck \ +// RUN: -check-prefix CUDA-D1 -check-prefix CUDA-D1-SM35 \ +// RUN: -check-prefix CUDA-D2 -check-prefix CUDA-D2-SM30 \ +// Then compile host side and incorporate both device-side outputs +// RUN: -check-prefix CUDA-H -check-prefix CUDA-H-I1 -check-prefix CUDA-H-I2 \ +// Make sure we don't link anything. +// RUN: -check-prefix CUDA-NL %s + +// Match device-side compilation +// CUDA-D1: "-cc1" "-triple" "nvptx{{(64)?}}-nvidia-cuda" +// CUDA-D1-SAME: "-fcuda-is-device" +// CUDA-D1-SM35-SAME: "-target-cpu" "sm_35" +// CUDA-D1-SAME: "-o" "[[GPUBINARY1:[^"]*]]" +// CUDA-D1-SAME: "-x" "cuda" + +// Match anothe device-side compilation +// CUDA-D2: "-cc1" "-triple" "nvptx{{(64)?}}-nvidia-cuda" +// CUDA-D2-SAME: "-fcuda-is-device" +// CUDA-D2-SM30-SAME: "-target-cpu" "sm_30" +// CUDA-D2-SAME: "-o" "[[GPUBINARY2:[^"]*]]" +// CUDA-D2-SAME: "-x" "cuda" + +// Match no device-side compilation +// CUDA-ND-NOT: "-cc1" "-triple" "nvptx{{64?}}-nvidia-cuda" +// CUDA-ND-SAME-NOT: "-fcuda-is-device" + +// Match host-side compilation +// CUDA-H: "-cc1" "-triple" +// CUDA-H-SAME-NOT: "nvptx{{64?}}-nvidia-cuda" +// CUDA-H-SAME-NOT: "-fcuda-is-device" +// CUDA-H-SAME: "-o" "[[HOSTOBJ:[^"]*]]" +// CUDA-H-SAME: "-x" "cuda" +// CUDA-H-I1-SAME: "-fcuda-include-gpubinary" "[[GPUBINARY1]]" +// CUDA-H-I2-SAME: "-fcuda-include-gpubinary" "[[GPUBINARY2]]" + +// Match no GPU code inclusion. +// CUDA-H-NI-NOT: "-fcuda-include-gpubinary" + +// Match no CUDA compilation +// CUDA-NH-NOT: "-cc1" "-triple" +// CUDA-NH-SAME-NOT: "-x" "cuda" + +// Match linker +// CUDA-L: "{{.*}}ld{{(.exe)?}}" +// CUDA-L-SAME: "[[HOSTOBJ]]" + +// Match no linker +// CUDA-NL-NOT: "{{.*}}ld{{(.exe)?}}" diff --git a/clang/test/Index/attributes-cuda.cu b/clang/test/Index/attributes-cuda.cu index 824bdb4c883..51f4aedd198 100644 --- a/clang/test/Index/attributes-cuda.cu +++ b/clang/test/Index/attributes-cuda.cu @@ -1,4 +1,6 @@ // RUN: c-index-test -test-load-source all -x cuda %s | FileCheck %s +// RUN: c-index-test -test-load-source all -x cuda --cuda-host-only %s | FileCheck %s +// RUN: c-index-test -test-load-source all -x cuda --cuda-device-only %s | FileCheck %s __attribute__((device)) void f_device(); __attribute__((global)) void f_global(); @@ -6,13 +8,13 @@ __attribute__((constant)) int* g_constant; __attribute__((shared)) float *g_shared; __attribute__((host)) void f_host(); -// CHECK: attributes-cuda.cu:3:30: FunctionDecl=f_device:3:30 -// CHECK-NEXT: attributes-cuda.cu:3:16: attribute(device) -// CHECK: attributes-cuda.cu:4:30: FunctionDecl=f_global:4:30 -// CHECK-NEXT: attributes-cuda.cu:4:16: attribute(global) -// CHECK: attributes-cuda.cu:5:32: VarDecl=g_constant:5:32 (Definition) -// CHECK-NEXT: attributes-cuda.cu:5:16: attribute(constant) -// CHECK: attributes-cuda.cu:6:32: VarDecl=g_shared:6:32 (Definition) -// CHECK-NEXT: attributes-cuda.cu:6:16: attribute(shared) -// CHECK: attributes-cuda.cu:7:28: FunctionDecl=f_host:7:28 -// CHECK-NEXT: attributes-cuda.cu:7:16: attribute(host) +// CHECK: attributes-cuda.cu:5:30: FunctionDecl=f_device:5:30 +// CHECK-NEXT: attributes-cuda.cu:5:16: attribute(device) +// CHECK: attributes-cuda.cu:6:30: FunctionDecl=f_global:6:30 +// CHECK-NEXT: attributes-cuda.cu:6:16: attribute(global) +// CHECK: attributes-cuda.cu:7:32: VarDecl=g_constant:7:32 (Definition) +// CHECK-NEXT: attributes-cuda.cu:7:16: attribute(constant) +// CHECK: attributes-cuda.cu:8:32: VarDecl=g_shared:8:32 (Definition) +// CHECK-NEXT: attributes-cuda.cu:8:16: attribute(shared) +// CHECK: attributes-cuda.cu:9:28: FunctionDecl=f_host:9:28 +// CHECK-NEXT: attributes-cuda.cu:9:16: attribute(host) diff --git a/clang/test/Index/index-file.cu b/clang/test/Index/index-file.cu new file mode 100644 index 00000000000..26b93f06945 --- /dev/null +++ b/clang/test/Index/index-file.cu @@ -0,0 +1,9 @@ +// Make sure we can process CUDA file even if driver creates multiple jobs +// RUN: c-index-test -test-load-source all %s | FileCheck %s -check-prefix=CHECK-ANY +// Make sure we process correct side of cuda compilation +// RUN: c-index-test -test-load-source all --cuda-host-only %s | FileCheck %s -check-prefix=CHECK-HOST +// RUN: c-index-test -test-load-source all --cuda-device-only %s | FileCheck %s -check-prefix=CHECK-DEVICE + +// CHECK-ANY: macro definition=__cplusplus +// CHECK-HOST-NOT: macro definition=__CUDA_ARCH__ +// CHECK-DEVICE: macro definition=__CUDA_ARCH__ diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp index 49b9da23865..6e48bab9f50 100644 --- a/clang/tools/libclang/CIndex.cpp +++ b/clang/tools/libclang/CIndex.cpp @@ -3102,6 +3102,12 @@ static void clang_parseTranslationUnit_Impl(void *UserData) { /*AllowPCHWithCompilerErrors=*/true, SkipFunctionBodies, /*UserFilesAreVolatile=*/true, ForSerialization, &ErrUnit)); + // Early failures in LoadFromCommandLine may return with ErrUnit unset. + if (!Unit && !ErrUnit) { + PTUI->result = CXError_ASTReadError; + return; + } + if (NumErrors != Diags->getClient()->getNumErrors()) { // Make sure to check that 'Unit' is non-NULL. if (CXXIdx->getDisplayDiagnostics()) diff --git a/clang/unittests/ASTMatchers/ASTMatchersTest.h b/clang/unittests/ASTMatchers/ASTMatchersTest.h index 17b69dac45a..d79a3175b47 100644 --- a/clang/unittests/ASTMatchers/ASTMatchersTest.h +++ b/clang/unittests/ASTMatchers/ASTMatchersTest.h @@ -164,6 +164,7 @@ testing::AssertionResult matchesConditionallyWithCuda( std::vector<std::string> Args; Args.push_back("-xcuda"); Args.push_back("-fno-ms-extensions"); + Args.push_back("--cuda-host-only"); Args.push_back(CompileArg); if (!runToolOnCodeWithArgs(Factory->create(), CudaHeader + Code, Args)) { |