3 files changed, 17 insertions, 1 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 76c2644867a..b48b2391110 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3571,7 +3571,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
   }
   if (AS == AMDGPUASI.CONSTANT_ADDRESS || AS == AMDGPUASI.GLOBAL_ADDRESS) {
     if (Subtarget->getScalarizeGlobalBehavior() && isMemOpUniform(Load) &&
-                  isMemOpHasNoClobberedMemOperand(Load))
+        !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load))
       return SDValue();
     // Non-uniform loads will be selected to MUBUF instructions, so they
     // have the same legalization requirements as global and private
diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td
index 5b840a14dbc..73dd8b7daa4 100644
--- a/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -229,6 +229,7 @@ def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{
     ((Ld->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS &&
     static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N)) ||
     (Subtarget->getScalarizeGlobalBehavior() && Ld->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS &&
+    !Ld->isVolatile() &&
     static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N) &&
     static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpHasNoClobberedMemOperand(N)));
 }]>;
diff --git a/llvm/test/CodeGen/AMDGPU/not-scalarize-volatile-load.ll b/llvm/test/CodeGen/AMDGPU/not-scalarize-volatile-load.ll
new file mode 100644
index 00000000000..bced3c408c5
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/not-scalarize-volatile-load.ll
@@ -0,0 +1,15 @@
+; RUN: llc -mtriple amdgcn--amdhsa -mcpu=fiji -amdgpu-scalarize-global-loads < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: @volatile_load
+; GCN:  s_load_dwordx2 s{{\[}}[[LO_SREG:[0-9]+]]:[[HI_SREG:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0
+; GCN:  v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]]
+; GCN:  v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]]
+; GCN:  flat_load_dword v{{[0-9]+}}, v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
+
+define amdgpu_kernel void @volatile_load(i32 addrspace(1)* %arg, i32 addrspace(1)* nocapture %arg1) {
+bb:
+  %tmp18 = load volatile i32, i32 addrspace(1)* %arg, align 4
+  %tmp26 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 5
+  store i32 %tmp18, i32 addrspace(1)* %tmp26, align 4
+  ret void
+}