2 files changed, 20 insertions, 10 deletions
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index c784fec1366..41c1da333ff 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -2616,17 +2616,19 @@ class OffloadingActionBuilder final {
             C.MakeAction<LinkJobAction>(CudaDeviceActions,
                 types::TY_HIP_FATBIN);
 
-        DA.add(*CudaFatBinary, *ToolChains.front(), /*BoundArch=*/nullptr,
-               AssociatedOffloadKind);
-        // Clear the fat binary, it is already a dependence to an host
-        // action.
-        CudaFatBinary = nullptr;
+        if (!CompileDeviceOnly) {
+          DA.add(*CudaFatBinary, *ToolChains.front(), /*BoundArch=*/nullptr,
+                 AssociatedOffloadKind);
+          // Clear the fat binary, it is already a dependence to an host
+          // action.
+          CudaFatBinary = nullptr;
+        }
 
         // Remove the CUDA actions as they are already connected to an host
         // action or fat binary.
         CudaDeviceActions.clear();
 
-        return ABRT_Success;
+        return CompileDeviceOnly ? ABRT_Ignore_Host : ABRT_Success;
       } else if (CurPhase == phases::Link) {
         // Save CudaDeviceActions to DeviceLinkerInputs for each GPU subarch.
         // This happens to each device action originated from each input file.
@@ -3014,8 +3016,10 @@ public:
     }
 
     // If we can use the bundler, replace the host action by the bundling one in
-    // the resulting list. Otherwise, just append the device actions.
-    if (CanUseBundler && !OffloadAL.empty()) {
+    // the resulting list. Otherwise, just append the device actions. For
+    // device only compilation, HostAction is a null pointer, therefore only do
+    // this when HostAction is not a null pointer.
+    if (CanUseBundler && HostAction && !OffloadAL.empty()) {
       // Add the host action to the list in order to create the bundling action.
       OffloadAL.push_back(HostAction);
 
diff --git a/clang/test/Driver/cuda-phases.cu b/clang/test/Driver/cuda-phases.cu
index 2160d19f5c6..58be50ae2e1 100644
--- a/clang/test/Driver/cuda-phases.cu
+++ b/clang/test/Driver/cuda-phases.cu
@@ -157,6 +157,7 @@
 // HBIN-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
 // HBIN-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (host-[[T]])
 // HBIN-DAG: [[P5:[0-9]+]]: linker, {[[P4]]}, image, (host-[[T]])
+// HBIN-NOT: device
 //
 // Test single gpu architecture up to the assemble phase in host-only
 // compilation mode.
@@ -172,6 +173,7 @@
 // HASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]])
 // HASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-[[T]])
 // HASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
+// HASM-NOT: device
 
 //
 // Test two gpu architectures with complete compilation in host-only
@@ -190,6 +192,7 @@
 // HBIN2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
 // HBIN2-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (host-[[T]])
 // HBIN2-DAG: [[P5:[0-9]+]]: linker, {[[P4]]}, image, (host-[[T]])
+// HBIN2-NOT: device
 
 //
 // Test two gpu architectures up to the assemble phase in host-only
@@ -206,6 +209,7 @@
 // HASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]])
 // HASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-[[T]])
 // HASM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
+// HASM2-NOT: device
 
 //
 // Test single gpu architecture with complete compilation in device-only
@@ -224,7 +228,7 @@
 // DBIN_NV-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-[[T]], [[ARCH]])
 // DBIN_NV-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (device-[[T]], [[ARCH]])
 // DBIN_NV-DAG: [[P5:[0-9]+]]: offload, "device-[[T]] (nvptx64-nvidia-cuda:[[ARCH]])" {[[P4]]}, object
-
+// DBIN-NOT: host
 //
 // Test single gpu architecture up to the assemble phase in device-only
 // compilation mode.
@@ -241,6 +245,7 @@
 // DASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-[[T]], [[ARCH]])
 // DASM_NV-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-[[T]], [[ARCH]])
 // DASM_NV-DAG: [[P4:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE:nvptx64-nvidia-cuda|amdgcn-amd-amdhsa]]:[[ARCH]])" {[[P3]]}, assembler
+// DASM-NOT: host
 
 //
 // Test two gpu architectures with complete compilation in device-only
@@ -265,7 +270,7 @@
 // DBIN2_NV-DAG: [[P9:[0-9]+]]: backend, {[[P8]]}, assembler, (device-[[T]], [[ARCH2]])
 // DBIN2_NV-DAG: [[P10:[0-9]+]]: assembler, {[[P9]]}, object, (device-[[T]], [[ARCH2]])
 // DBIN2_NV-DAG: [[P11:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE]]:[[ARCH2]])" {[[P10]]}, object
-
+// DBIN2-NOT: host
 //
 // Test two gpu architectures up to the assemble phase in device-only
 // compilation mode.
@@ -288,3 +293,4 @@
 // DASM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-[[T]], [[ARCH2]])
 // DASM2_NV-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (device-[[T]], [[ARCH2]])
 // DASM2_NV-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE]]:[[ARCH2]])" {[[P8]]}, assembler
+// DASM2-NOT: host