AMDGPU: Default to using scalar mov to materialize immediate

This is the conservatively correct way because it's easy to move or replace a scalar immediate. This was incorrect in the case when the register class wasn't known from the static instruction definition, but still needed to be an SGPR. The main example of this is inlineasm has an SGPR constraint. Also start verifying the register classes of inlineasm operands. llvm-svn: 285762
author: Matt Arsenault <Matthew.Arsenault@amd.com> 2016-11-01 22:55:07 +0000
committer: Matt Arsenault <Matthew.Arsenault@amd.com> 2016-11-01 22:55:07 +0000
commit: 3d463193a961c891ec1b49d07de4729794cc5b14 (patch)
tree: 6084e14c2d67c2a90ab6d3d40c288fdd8180ad52
parent: 8a99f120bc831d424185b65809903a1212989348 (diff)
download: bcm5719-llvm-3d463193a961c891ec1b49d07de4729794cc5b14.tar.gz
bcm5719-llvm-3d463193a961c891ec1b49d07de4729794cc5b14.zip
6 files changed, 86 insertions, 23 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 905ee467319..108995a463f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1943,6 +1943,28 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
     return false;
   }
 
+  if (MI.isInlineAsm()) {
+    // Verify register classes for inlineasm constraints.
+    for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
+         I != E; ++I) {
+      const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
+      if (!RC)
+        continue;
+
+      const MachineOperand &Op = MI.getOperand(I);
+      if (!Op.isReg())
+        continue;
+
+      unsigned Reg = Op.getReg();
+      if (!TargetRegisterInfo::isVirtualRegister(Reg) && !RC->contains(Reg)) {
+        ErrInfo = "inlineasm operand has incorrect register class.";
+        return false;
+      }
+    }
+
+    return true;
+  }
+
   // Make sure the register classes are correct.
   for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
     if (MI.getOperand(i).isFPImm()) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index a56d909624f..f19e99e7cd1 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -265,19 +265,25 @@ class InlineFPImm <ValueType vt> : PatLeaf <(vt fpimm), [{
   return isInlineImmediate(N);
 }]>;
 
-class SGPRImm <dag frag> : PatLeaf<frag, [{
+class VGPRImm <dag frag> : PatLeaf<frag, [{
   if (Subtarget->getGeneration() < SISubtarget::SOUTHERN_ISLANDS) {
     return false;
   }
   const SIRegisterInfo *SIRI =
       static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
+  unsigned Limit = 0;
   for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
-                                                U != E; ++U) {
+         Limit < 10 && U != E; ++U, ++Limit) {
     const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo());
-    if (RC && SIRI->isSGPRClass(RC))
-      return true;
+
+    // If the register class is unknown, it could be an unknown
+    // register class that needs to be an SGPR, e.g. an inline asm
+    // constraint
+    if (!RC || SIRI->isSGPRClass(RC))
+      return false;
   }
-  return false;
+
+  return Limit < 10;
 }]>;
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index f3ccee288fd..4122eb915f3 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -615,23 +615,23 @@ def : Pat <
 /********** ================== **********/
 
 def : Pat <
-  (SGPRImm<(i32 imm)>:$imm),
-  (S_MOV_B32 imm:$imm)
+  (VGPRImm<(i32 imm)>:$imm),
+  (V_MOV_B32_e32 imm:$imm)
 >;
 
 def : Pat <
-  (SGPRImm<(f32 fpimm)>:$imm),
-  (S_MOV_B32 (f32 (bitcast_fpimm_to_i32 $imm)))
+  (VGPRImm<(f32 fpimm)>:$imm),
+  (V_MOV_B32_e32 (f32 (bitcast_fpimm_to_i32 $imm)))
 >;
 
 def : Pat <
   (i32 imm:$imm),
-  (V_MOV_B32_e32 imm:$imm)
+  (S_MOV_B32 imm:$imm)
 >;
 
 def : Pat <
   (f32 fpimm:$imm),
-  (V_MOV_B32_e32 (f32 (bitcast_fpimm_to_i32 $imm)))
+  (S_MOV_B32 (f32 (bitcast_fpimm_to_i32 $imm)))
 >;
 
 def : Pat <
diff --git a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
index ad5e8908340..27b3cc0e435 100644
--- a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
@@ -15,7 +15,7 @@
 ; GCN: s_mov_b32 m0, -1
 ; GCN: ds_read_b32 [[LOAD0:v[0-9]+]]
 
-; GCN: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]\]]], v0,
+; GCN: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]\]]], s{{[0-9]+}}, v0
 ; GCN: s_mov_b64 s{{\[}}[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]{{\]}}, exec
 ; GCN: s_and_b64 s{{\[}}[[ANDEXEC_LO:[0-9]+]]:[[ANDEXEC_HI:[0-9]+]]{{\]}}, s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}, [[CMP0]]
 ; GCN: s_xor_b64 s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}, s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}
@@ -93,24 +93,24 @@ endif:
 ; GCN: s_mov_b32 m0, -1
 ; GCN: ds_read_b32 [[LOAD0:v[0-9]+]]
 
-; GCN: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]\]]], v0,
+; GCN: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]\]]], s{{[0-9]+}}, v0
 
 ; GCN: s_mov_b64 s{{\[}}[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]{{\]}}, exec
 ; GCN: s_and_b64 s{{\[}}[[ANDEXEC_LO:[0-9]+]]:[[ANDEXEC_HI:[0-9]+]]{{\]}}, s{{\[}}[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]{{\]}}, [[CMP0]]
 ; GCN: s_xor_b64 s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}, s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}
 
+; Spill load
+; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 ; 4-byte Folded Spill
+
 ; Spill saved exec
 ; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]]
 ; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[SAVEEXEC_HI]], [[SAVEEXEC_HI_LANE:[0-9]+]]
 
 
 ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_LO:[0-9]+]], s[[SAVEEXEC_LO]]
-; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 ; 8-byte Folded Spill
+; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:12 ; 8-byte Folded Spill
 ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_HI:[0-9]+]], s[[SAVEEXEC_HI]]
-; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:4 ; 8-byte Folded Spill
-
-; Spill load
-; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 offset:[[VAL_OFFSET:[0-9]+]] ; 4-byte Folded Spill
+; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:16 ; 8-byte Folded Spill
 
 ; GCN: s_mov_b64 exec, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}
 
@@ -120,7 +120,7 @@ endif:
 
 
 ; GCN: [[LOOP:BB[0-9]+_[0-9]+]]:
-; GCN: buffer_load_dword v[[VAL_LOOP_RELOAD:[0-9]+]], off, s[0:3], s7 offset:[[VAL_OFFSET]] ; 4-byte Folded Reload
+; GCN: buffer_load_dword v[[VAL_LOOP_RELOAD:[0-9]+]], off, s[0:3], s7 ; 4-byte Folded Reload
 ; GCN: v_subrev_i32_e32 [[VAL_LOOP:v[0-9]+]], vcc, v{{[0-9]+}}, v[[VAL_LOOP_RELOAD]]
 ; GCN: v_cmp_ne_u32_e32 vcc,
 ; GCN: s_and_b64 vcc, exec, vcc
@@ -133,11 +133,11 @@ endif:
 ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]]
 ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]]
 
-; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 ; 8-byte Folded Reload
+; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:12 ; 8-byte Folded Reload
 ; VMEM: s_waitcnt vmcnt(0)
 ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC_LO]]
 
-; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:4 ; 8-byte Folded Reload
+; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:16 ; 8-byte Folded Reload
 ; VMEM: s_waitcnt vmcnt(0)
 ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC_HI]]
 
diff --git a/llvm/test/CodeGen/AMDGPU/inline-constraints.ll b/llvm/test/CodeGen/AMDGPU/inline-constraints.ll
index 7282e89e85b..3c0bb75a607 100644
--- a/llvm/test/CodeGen/AMDGPU/inline-constraints.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-constraints.ll
@@ -26,8 +26,43 @@ entry:
 ; GCN: s_mov_b32 m0, -1
 ; GCN-NOT: s_mov_b32 s{{[0-9]+}}, m0
 ; GCN: ; use m0
-define void @inline_sreg_constraint_m0(i32 addrspace(1)* %ptr) {
+define void @inline_sreg_constraint_m0() {
   %m0 = tail call i32 asm sideeffect "s_mov_b32 m0, -1", "={M0}"()
   tail call void asm sideeffect "; use $0", "s"(i32 %m0)
   ret void
 }
+
+; GCN-LABEL: {{^}}inline_sreg_constraint_imm_i32:
+; GCN: s_mov_b32 [[REG:s[0-9]+]], 32
+; GCN: ; use [[REG]]
+define void @inline_sreg_constraint_imm_i32() {
+  tail call void asm sideeffect "; use $0", "s"(i32 32)
+  ret void
+}
+
+; GCN-LABEL: {{^}}inline_sreg_constraint_imm_f32:
+; GCN: s_mov_b32 [[REG:s[0-9]+]], 1.0
+; GCN: ; use [[REG]]
+define void @inline_sreg_constraint_imm_f32() {
+  tail call void asm sideeffect "; use $0", "s"(float 1.0)
+  ret void
+}
+
+; FIXME: Should be able to use s_mov_b64
+; GCN-LABEL: {{^}}inline_sreg_constraint_imm_i64:
+; GCN-DAG: s_mov_b32 s[[REG_LO:[0-9]+]], -4{{$}}
+; GCN-DAG: s_mov_b32 s[[REG_HI:[0-9]+]], -1{{$}}
+; GCN: ; use s{{\[}}[[REG_LO]]:[[REG_HI]]{{\]}}
+define void @inline_sreg_constraint_imm_i64() {
+  tail call void asm sideeffect "; use $0", "s"(i64 -4)
+  ret void
+}
+
+; GCN-LABEL: {{^}}inline_sreg_constraint_imm_f64:
+; GCN-DAG: s_mov_b32 s[[REG_LO:[0-9]+]], 0{{$}}
+; GCN-DAG: s_mov_b32 s[[REG_HI:[0-9]+]], 0x3ff00000{{$}}
+; GCN: ; use s{{\[}}[[REG_LO]]:[[REG_HI]]{{\]}}
+define void @inline_sreg_constraint_imm_f64() {
+  tail call void asm sideeffect "; use $0", "s"(double 1.0)
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
index 0cdb1c9fb3a..37da9c5d5ad 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -15,7 +15,7 @@
 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
-; GCN-DAG: v_mov_b32_e32 [[CONSTREG:v[0-9]+]], 0x40a00000
+; GCN-DAG: s_mov_b32 [[CONSTREG:s[0-9]+]], 0x40a00000
 ; GCN-DAG: v_mov_b32_e32 v[[LOW_REG:[0-9]+]], [[CONSTREG]]
 ; GCN: buffer_store_dwordx4 v{{\[}}[[LOW_REG]]:
 define void @insertelement_v4f32_0(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
author	Matt Arsenault <Matthew.Arsenault@amd.com>	2016-11-01 22:55:07 +0000
committer	Matt Arsenault <Matthew.Arsenault@amd.com>	2016-11-01 22:55:07 +0000
commit	3d463193a961c891ec1b49d07de4729794cc5b14 (patch)
tree	6084e14c2d67c2a90ab6d3d40c288fdd8180ad52
parent	8a99f120bc831d424185b65809903a1212989348 (diff)
download	bcm5719-llvm-3d463193a961c891ec1b49d07de4729794cc5b14.tar.gz bcm5719-llvm-3d463193a961c891ec1b49d07de4729794cc5b14.zip