diff options
Diffstat (limited to 'clang/lib')
-rw-r--r-- | clang/lib/CodeGen/CGCUDANV.cpp | 2 | ||||
-rw-r--r-- | clang/lib/Driver/Action.cpp | 2 | ||||
-rw-r--r-- | clang/lib/Driver/Driver.cpp | 52 | ||||
-rw-r--r-- | clang/lib/Driver/ToolChains.cpp | 27 | ||||
-rw-r--r-- | clang/lib/Driver/ToolChains.h | 11 | ||||
-rw-r--r-- | clang/lib/Driver/Tools.cpp | 78 | ||||
-rw-r--r-- | clang/lib/Driver/Tools.h | 35 | ||||
-rw-r--r-- | clang/lib/Driver/Types.cpp | 3 |
8 files changed, 184 insertions, 26 deletions
diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp index 045e19b189d..9dd7928bcf9 100644 --- a/clang/lib/CodeGen/CGCUDANV.cpp +++ b/clang/lib/CodeGen/CGCUDANV.cpp @@ -259,6 +259,8 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { TheModule, FatbinWrapperTy, true, llvm::GlobalValue::InternalLinkage, llvm::ConstantStruct::get(FatbinWrapperTy, Values), "__cuda_fatbin_wrapper"); + // NVIDIA's cuobjdump looks for fatbins in this section. + FatbinWrapper->setSection(".nvFatBinSegment"); // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper); llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall( diff --git a/clang/lib/Driver/Action.cpp b/clang/lib/Driver/Action.cpp index e9490e96db8..b45f290efc1 100644 --- a/clang/lib/Driver/Action.cpp +++ b/clang/lib/Driver/Action.cpp @@ -75,7 +75,7 @@ CudaDeviceAction::CudaDeviceAction(Action *Input, const char *ArchName, bool AtTopLevel) : Action(CudaDeviceClass, Input), GpuArchName(ArchName), AtTopLevel(AtTopLevel) { - assert(IsValidGpuArchName(GpuArchName)); + assert(!GpuArchName || IsValidGpuArchName(GpuArchName)); } const char *CudaDeviceAction::getComputeArchName() const { diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index e9a37c6aa81..0cdfb4fe105 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -949,8 +949,9 @@ static unsigned PrintActions1(const Compilation &C, Action *A, os << '"' << BIA->getArchName() << '"' << ", {" << PrintActions1(C, *BIA->begin(), Ids) << "}"; } else if (CudaDeviceAction *CDA = dyn_cast<CudaDeviceAction>(A)) { - os << '"' << CDA->getGpuArchName() << '"' << ", {" - << PrintActions1(C, *CDA->begin(), Ids) << "}"; + os << '"' + << (CDA->getGpuArchName() ? CDA->getGpuArchName() : "(multiple archs)") + << '"' << ", {" << PrintActions1(C, *CDA->begin(), Ids) << "}"; } else { const ActionList *AL; if (CudaHostAction *CHA = dyn_cast<CudaHostAction>(A)) { @@ -1327,7 +1328,7 @@ static Action *buildCudaActions(Compilation &C, DerivedArgList &Args, // Check whether any of device actions stopped before they could generate PTX. bool PartialCompilation = llvm::any_of(CudaDeviceActions, [](const Action *a) { - return a->getKind() != Action::BackendJobClass; + return a->getKind() != Action::AssembleJobClass; }); // Figure out what to do with device actions -- pass them as inputs to the @@ -1356,16 +1357,32 @@ static Action *buildCudaActions(Compilation &C, DerivedArgList &Args, return HostAction; } - // Outputs of device actions during complete CUDA compilation get created - // with AtTopLevel=false and become inputs for the host action. + // If we're not a partial or device-only compilation, we compile each arch to + // ptx and assemble to cubin, then feed the cubin *and* the ptx into a device + // "link" action, which uses fatbinary to combine these cubins into one + // fatbin. The fatbin is then an input to the host compilation. ActionList DeviceActions; - for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) - DeviceActions.push_back( - C.MakeAction<CudaDeviceAction>(CudaDeviceActions[I], GpuArchList[I], - /* AtTopLevel */ false)); + for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) { + Action* AssembleAction = CudaDeviceActions[I]; + assert(AssembleAction->getType() == types::TY_Object); + assert(AssembleAction->getInputs().size() == 1); + + Action* BackendAction = AssembleAction->getInputs()[0]; + assert(BackendAction->getType() == types::TY_PP_Asm); + + for (const auto& A : {AssembleAction, BackendAction}) { + DeviceActions.push_back(C.MakeAction<CudaDeviceAction>( + A, GpuArchList[I], /* AtTopLevel */ false)); + } + } + auto FatbinAction = C.MakeAction<CudaDeviceAction>( + C.MakeAction<LinkJobAction>(DeviceActions, types::TY_CUDA_FATBIN), + /* GpuArchName = */ nullptr, + /* AtTopLevel = */ false); // Return a new host action that incorporates original host action and all // device actions. - return C.MakeAction<CudaHostAction>(HostAction, DeviceActions); + return C.MakeAction<CudaHostAction>(std::move(HostAction), + ActionList({FatbinAction})); } void Driver::BuildActions(Compilation &C, const ToolChain &TC, @@ -1600,7 +1617,7 @@ Action *Driver::ConstructPhaseAction(Compilation &C, const ToolChain &TC, return C.MakeAction<BackendJobAction>(Input, types::TY_PP_Asm); } case phases::Assemble: - return C.MakeAction<AssembleJobAction>(Input, types::TY_Object); + return C.MakeAction<AssembleJobAction>(std::move(Input), types::TY_Object); } llvm_unreachable("invalid phase in ConstructPhaseAction"); @@ -1849,11 +1866,14 @@ InputInfo Driver::BuildJobsForActionNoCache( if (const CudaDeviceAction *CDA = dyn_cast<CudaDeviceAction>(A)) { // Initial processing of CudaDeviceAction carries host params. // Call BuildJobsForAction() again, now with correct device parameters. - assert(CDA->getGpuArchName() && "No GPU name in device action."); - return BuildJobsForAction(C, *CDA->begin(), C.getCudaDeviceToolChain(), - CDA->getGpuArchName(), CDA->isAtTopLevel(), - /*MultipleArchs*/ true, LinkingOutput, - CachedResults); + InputInfo II = BuildJobsForAction( + C, *CDA->begin(), C.getCudaDeviceToolChain(), CDA->getGpuArchName(), + CDA->isAtTopLevel(), /*MultipleArchs*/ true, LinkingOutput, + CachedResults); + // Currently II's Action is *CDA->begin(). Set it to CDA instead, so that + // one can retrieve II's GPU arch. + II.setAction(A); + return II; } const ActionList *Inputs = &A->getInputs(); diff --git a/clang/lib/Driver/ToolChains.cpp b/clang/lib/Driver/ToolChains.cpp index e3f25f1e32e..15b36778220 100644 --- a/clang/lib/Driver/ToolChains.cpp +++ b/clang/lib/Driver/ToolChains.cpp @@ -1652,13 +1652,14 @@ void Generic_GCC::CudaInstallationDetector::init( continue; CudaInstallPath = CudaPath; + CudaBinPath = CudaPath + "/bin"; CudaIncludePath = CudaInstallPath + "/include"; CudaLibDevicePath = CudaInstallPath + "/nvvm/libdevice"; CudaLibPath = CudaInstallPath + (TargetTriple.isArch64Bit() ? "/lib64" : "/lib"); if (!(D.getVFS().exists(CudaIncludePath) && - D.getVFS().exists(CudaLibPath) && + D.getVFS().exists(CudaBinPath) && D.getVFS().exists(CudaLibPath) && D.getVFS().exists(CudaLibDevicePath))) continue; @@ -4182,13 +4183,16 @@ Tool *DragonFly::buildLinker() const { return new tools::dragonfly::Linker(*this); } -/// Stub for CUDA toolchain. At the moment we don't have assembler or -/// linker and need toolchain mainly to propagate device-side options -/// to CC1. +/// CUDA toolchain. Our assembler is ptxas, and our "linker" is fatbinary, +/// which isn't properly a linker but nonetheless performs the step of stitching +/// together object files from the assembler into a single blob. CudaToolChain::CudaToolChain(const Driver &D, const llvm::Triple &Triple, const ArgList &Args) - : Linux(D, Triple, Args) {} + : Linux(D, Triple, Args) { + if (CudaInstallation.isValid()) + getProgramPaths().push_back(CudaInstallation.getBinPath()); +} void CudaToolChain::addClangTargetOptions(const llvm::opt::ArgList &DriverArgs, @@ -4222,7 +4226,7 @@ CudaToolChain::TranslateArgs(const llvm::opt::DerivedArgList &Args, for (Arg *A : Args) { if (A->getOption().matches(options::OPT_Xarch__)) { // Skip this argument unless the architecture matches BoundArch - if (A->getValue(0) != StringRef(BoundArch)) + if (!BoundArch || A->getValue(0) != StringRef(BoundArch)) continue; unsigned Index = Args.getBaseArgs().MakeIndex(A->getValue(1)); @@ -4253,10 +4257,19 @@ CudaToolChain::TranslateArgs(const llvm::opt::DerivedArgList &Args, DAL->append(A); } - DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ), BoundArch); + if (BoundArch) + DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ), BoundArch); return DAL; } +Tool *CudaToolChain::buildAssembler() const { + return new tools::NVPTX::Assembler(*this); +} + +Tool *CudaToolChain::buildLinker() const { + return new tools::NVPTX::Linker(*this); +} + /// XCore tool chain XCoreToolChain::XCoreToolChain(const Driver &D, const llvm::Triple &Triple, const ArgList &Args) diff --git a/clang/lib/Driver/ToolChains.h b/clang/lib/Driver/ToolChains.h index 622c414c62e..a5f54733e8b 100644 --- a/clang/lib/Driver/ToolChains.h +++ b/clang/lib/Driver/ToolChains.h @@ -163,6 +163,7 @@ protected: bool IsValid; const Driver &D; std::string CudaInstallPath; + std::string CudaBinPath; std::string CudaLibPath; std::string CudaLibDevicePath; std::string CudaIncludePath; @@ -179,6 +180,8 @@ protected: /// \brief Get the detected Cuda installation path. StringRef getInstallPath() const { return CudaInstallPath; } + /// \brief Get the detected path to Cuda's bin directory. + StringRef getBinPath() const { return CudaBinPath; } /// \brief Get the detected Cuda Include path. StringRef getIncludePath() const { return CudaIncludePath; } /// \brief Get the detected Cuda library path. @@ -816,6 +819,14 @@ public: const char *BoundArch) const override; void addClangTargetOptions(const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args) const override; + + // Never try to use the integrated assembler with CUDA; always fork out to + // ptxas. + bool useIntegratedAs() const override { return false; } + +protected: + Tool *buildAssembler() const override; // ptxas + Tool *buildLinker() const override; // fatbinary (ok, not really a linker) }; class LLVM_LIBRARY_VISIBILITY MipsLLVMToolChain : public Linux { diff --git a/clang/lib/Driver/Tools.cpp b/clang/lib/Driver/Tools.cpp index 5e4777b4666..e498f98355c 100644 --- a/clang/lib/Driver/Tools.cpp +++ b/clang/lib/Driver/Tools.cpp @@ -10625,3 +10625,81 @@ void PS4cpu::Link::ConstructJob(Compilation &C, const JobAction &JA, else ConstructGoldLinkJob(*this, C, JA, Output, Inputs, Args, LinkingOutput); } + +void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA, + const InputInfo &Output, + const InputInfoList &Inputs, + const ArgList &Args, + const char *LinkingOutput) const { + const auto &TC = + static_cast<const toolchains::CudaToolChain &>(getToolChain()); + assert(TC.getArch() == llvm::Triple::nvptx || + TC.getArch() == llvm::Triple::nvptx64); + + std::vector<std::string> gpu_archs = + Args.getAllArgValues(options::OPT_march_EQ); + assert(gpu_archs.size() == 1 && "Exactly one GPU Arch required for ptxas."); + const std::string& gpu_arch = gpu_archs[0]; + + + ArgStringList CmdArgs; + CmdArgs.push_back(TC.getTriple().isArch64Bit() ? "-m64" : "-m32"); + + // Clang's default optimization level is -O0, but ptxas's default is -O3. + CmdArgs.push_back(Args.MakeArgString( + llvm::Twine("-O") + + Args.getLastArgValue(options::OPT_O_Group, "0").data())); + + // Don't bother passing -g to ptxas: It's enabled by default at -O0, and + // not supported at other optimization levels. + + CmdArgs.push_back("--gpu-name"); + CmdArgs.push_back(Args.MakeArgString(gpu_arch)); + CmdArgs.push_back("--output-file"); + CmdArgs.push_back(Args.MakeArgString(Output.getFilename())); + for (const auto& II : Inputs) + CmdArgs.push_back(Args.MakeArgString(II.getFilename())); + + for (const auto& A : Args.getAllArgValues(options::OPT_Xcuda_ptxas)) + CmdArgs.push_back(Args.MakeArgString(A)); + + const char *Exec = Args.MakeArgString(TC.GetProgramPath("ptxas")); + C.addCommand(llvm::make_unique<Command>(JA, *this, Exec, CmdArgs, Inputs)); +} + +// All inputs to this linker must be from CudaDeviceActions, as we need to look +// at the Inputs' Actions in order to figure out which GPU architecture they +// correspond to. +void NVPTX::Linker::ConstructJob(Compilation &C, const JobAction &JA, + const InputInfo &Output, + const InputInfoList &Inputs, + const ArgList &Args, + const char *LinkingOutput) const { + const auto &TC = + static_cast<const toolchains::CudaToolChain &>(getToolChain()); + assert(TC.getArch() == llvm::Triple::nvptx || + TC.getArch() == llvm::Triple::nvptx64); + + ArgStringList CmdArgs; + CmdArgs.push_back("--cuda"); + CmdArgs.push_back(TC.getTriple().isArch64Bit() ? "-64" : "-32"); + CmdArgs.push_back(Args.MakeArgString("--create")); + CmdArgs.push_back(Args.MakeArgString(Output.getFilename())); + + for (const auto& II : Inputs) { + auto* A = cast<const CudaDeviceAction>(II.getAction()); + // We need to pass an Arch of the form "sm_XX" for cubin files and + // "compute_XX" for ptx. + const char *Arch = (II.getType() == types::TY_PP_Asm) + ? A->getComputeArchName() + : A->getGpuArchName(); + CmdArgs.push_back(Args.MakeArgString(llvm::Twine("--image=profile=") + + Arch + ",file=" + II.getFilename())); + } + + for (const auto& A : Args.getAllArgValues(options::OPT_Xcuda_fatbinary)) + CmdArgs.push_back(Args.MakeArgString(A)); + + const char *Exec = Args.MakeArgString(TC.GetProgramPath("fatbinary")); + C.addCommand(llvm::make_unique<Command>(JA, *this, Exec, CmdArgs, Inputs)); +} diff --git a/clang/lib/Driver/Tools.h b/clang/lib/Driver/Tools.h index 2b137f4a6d0..69c046587f4 100644 --- a/clang/lib/Driver/Tools.h +++ b/clang/lib/Driver/Tools.h @@ -903,6 +903,41 @@ public: }; } // end namespace PS4cpu +namespace NVPTX { + +// Run ptxas, the NVPTX assembler. +class LLVM_LIBRARY_VISIBILITY Assembler : public Tool { + public: + Assembler(const ToolChain &TC) + : Tool("NVPTX::Assembler", "ptxas", TC, RF_Full, llvm::sys::WEM_UTF8, + "--options-file") {} + + bool hasIntegratedCPP() const override { return false; } + + void ConstructJob(Compilation &C, const JobAction &JA, + const InputInfo &Output, const InputInfoList &Inputs, + const llvm::opt::ArgList &TCArgs, + const char *LinkingOutput) const override; +}; + +// Runs fatbinary, which combines GPU object files ("cubin" files) and/or PTX +// assembly into a single output file. +class LLVM_LIBRARY_VISIBILITY Linker : public Tool { + public: + Linker(const ToolChain &TC) + : Tool("NVPTX::Linker", "fatbinary", TC, RF_Full, llvm::sys::WEM_UTF8, + "--options-file") {} + + bool hasIntegratedCPP() const override { return false; } + + void ConstructJob(Compilation &C, const JobAction &JA, + const InputInfo &Output, const InputInfoList &Inputs, + const llvm::opt::ArgList &TCArgs, + const char *LinkingOutput) const override; +}; + +} // end namespace NVPTX + } // end namespace tools } // end namespace driver } // end namespace clang diff --git a/clang/lib/Driver/Types.cpp b/clang/lib/Driver/Types.cpp index c29ce9462a0..3b3b67fc5d5 100644 --- a/clang/lib/Driver/Types.cpp +++ b/clang/lib/Driver/Types.cpp @@ -232,8 +232,7 @@ void types::getCompilationPhases(ID Id, llvm::SmallVectorImpl<phases::ID> &P) { P.push_back(phases::Compile); P.push_back(phases::Backend); } - if (Id != TY_CUDA_DEVICE) - P.push_back(phases::Assemble); + P.push_back(phases::Assemble); } } |