summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--llvm/include/llvm/IR/IntrinsicsAMDGPU.td13
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.cpp28
-rw-r--r--llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp13
-rw-r--r--llvm/lib/Target/AMDGPU/VOP2Instructions.td13
-rw-r--r--llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll20
-rw-r--r--llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/inserted-wait-states.mir4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll82
-rw-r--r--llvm/test/CodeGen/AMDGPU/sibling-call.ll2
9 files changed, 155 insertions, 22 deletions
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 1284939b447..408ab023656 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -768,6 +768,19 @@ def int_amdgcn_readlane :
GCCBuiltin<"__builtin_amdgcn_readlane">,
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrConvergent]>;
+// The value to write and lane select arguments must be uniform across the
+// currently active threads of the current wave. Otherwise, the result is
+// undefined.
+def int_amdgcn_writelane :
+ GCCBuiltin<"__builtin_amdgcn_writelane">,
+ Intrinsic<[llvm_i32_ty], [
+ llvm_i32_ty, // uniform value to write: returned by the selected lane
+ llvm_i32_ty, // uniform lane select
+ llvm_i32_ty // returned by all lanes other than the selected one
+ ],
+ [IntrNoMem, IntrConvergent]
+>;
+
def int_amdgcn_alignbit : Intrinsic<[llvm_i32_ty],
[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
[IntrNoMem, IntrSpeculatable]
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 654b96f792b..86c5f623dd0 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2711,8 +2711,9 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
}
- // Verify VOP*
- if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI)) {
+ // Verify VOP*. Ignore multiple sgpr operands on writelane.
+ if (Desc.getOpcode() != AMDGPU::V_WRITELANE_B32
+ && (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI))) {
// Only look at the true operands. Only a real operand can use the constant
// bus, and we don't want to check pseudo-operands like the source modifier
// flags.
@@ -3147,6 +3148,29 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
legalizeOpWithMove(MI, Src0Idx);
}
+ // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
+ // both the value to write (src0) and lane select (src1). Fix up non-SGPR
+ // src0/src1 with V_READFIRSTLANE.
+ if (Opc == AMDGPU::V_WRITELANE_B32) {
+ int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
+ MachineOperand &Src0 = MI.getOperand(Src0Idx);
+ const DebugLoc &DL = MI.getDebugLoc();
+ if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
+ unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
+ .add(Src0);
+ Src0.ChangeToRegister(Reg, false);
+ }
+ if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
+ unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ const DebugLoc &DL = MI.getDebugLoc();
+ BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
+ .add(Src1);
+ Src1.ChangeToRegister(Reg, false);
+ }
+ return;
+ }
+
// VOP2 src0 instructions support all operand types, so we don't need to check
// their legality. If src1 is already legal, we don't need to do anything.
if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1))
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 21658ece2f2..0ade120487f 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -636,6 +636,7 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
MachineBasicBlock *MBB = MI->getParent();
MachineFunction *MF = MBB->getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+ DenseSet<unsigned> SGPRSpillVGPRDefinedSet;
ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills
= MFI->getSGPRToVGPRSpills(Index);
@@ -732,11 +733,21 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
if (SpillToVGPR) {
SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
+ // During SGPR spilling to VGPR, determine if the VGPR is defined. The
+ // only circumstance in which we say it is undefined is when it is the
+ // first spill to this VGPR in the first basic block.
+ bool VGPRDefined = true;
+ if (MBB == &MF->front())
+ VGPRDefined = !SGPRSpillVGPRDefinedSet.insert(Spill.VGPR).second;
+
+ // Mark the "old value of vgpr" input undef only if this is the first sgpr
+ // spill to this specific vgpr in the first basic block.
BuildMI(*MBB, MI, DL,
TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
Spill.VGPR)
.addReg(SubReg, getKillRegState(IsKill))
- .addImm(Spill.Lane);
+ .addImm(Spill.Lane)
+ .addReg(Spill.VGPR, VGPRDefined ? 0 : RegState::Undef);
// FIXME: Since this spills to another register instead of an actual
// frame index, we should delete the frame index when all references to
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index 8c49fd825e3..40bc0f06e60 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -323,15 +323,17 @@ def VOP_READLANE : VOPProfile<[i32, i32, i32]> {
let HasSDWA9 = 0;
}
-def VOP_WRITELANE : VOPProfile<[i32, i32, i32]> {
+def VOP_WRITELANE : VOPProfile<[i32, i32, i32, i32]> {
let Outs32 = (outs VGPR_32:$vdst);
let Outs64 = Outs32;
- let Ins32 = (ins SCSrc_b32:$src0, SCSrc_b32:$src1);
+ let Ins32 = (ins SCSrc_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in);
let Ins64 = Ins32;
let Asm32 = " $vdst, $src0, $src1";
let Asm64 = Asm32;
let HasExt = 0;
let HasSDWA9 = 0;
+ let HasSrc2 = 0;
+ let HasSrc2Mods = 0;
}
//===----------------------------------------------------------------------===//
@@ -399,7 +401,10 @@ let isConvergent = 1, Uses = []<Register> in {
def V_READLANE_B32 : VOP2_Pseudo<"v_readlane_b32", VOP_READLANE,
[(set i32:$vdst, (int_amdgcn_readlane i32:$src0, i32:$src1))], "">;
-def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE, [], "">;
+let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
+def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE,
+ [(set i32:$vdst, (int_amdgcn_writelane i32:$src0, i32:$src1, i32:$vdst_in))], "">;
+} // End $vdst = $vdst_in, DisableEncoding $vdst_in
} // End isConvergent = 1
defm V_BFM_B32 : VOP2Inst <"v_bfm_b32", VOP_NO_EXT<VOP_I32_I32_I32>>;
@@ -640,7 +645,7 @@ defm V_SUBBREV_U32 : VOP2be_Real_e32e64_si <0x2a>;
defm V_READLANE_B32 : VOP2_Real_si <0x01>;
-let InOperandList = (ins SSrc_b32:$src0, SCSrc_b32:$src1) in {
+let InOperandList = (ins SSrc_b32:$src0, SCSrc_b32:$src1, VSrc_b32:$vdst_in) in {
defm V_WRITELANE_B32 : VOP2_Real_si <0x02>;
}
diff --git a/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll
index f164d8179ed..b56ec379bf1 100644
--- a/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll
+++ b/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll
@@ -33,16 +33,14 @@ entry:
; GCN-DAG: buffer_store_dword v32
; GCN-DAG: buffer_store_dword v33
; GCN-NOT: v_writelane_b32 v{{[0-9]+}}, s32
-; GCN: v_writelane_b32
-
+; GCN-DAG: v_writelane_b32
; GCN-DAG: s_add_u32 s32, s32, 0xb00{{$}}
-
; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5 offset:4{{$}}
-; GCN: v_add_{{[iu]}}32_e32 [[ADD0:v[0-9]+]], vcc, 1, [[LOAD0]]
-; GCN: buffer_store_dword [[ADD0]], off, s[0:3], s5 offset:4{{$}}
+; GCN-DAG: v_add_{{[iu]}}32_e32 [[ADD0:v[0-9]+]], vcc, 1, [[LOAD0]]
+; GCN-DAG: buffer_store_dword [[ADD0]], off, s[0:3], s5 offset:4{{$}}
-; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:20{{$}}
-; GCN: v_add_{{[iu]}}32_e32 [[ADD1:v[0-9]+]], vcc, 2, [[LOAD1]]
+; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:20{{$}}
+; GCN-DAG: v_add_{{[iu]}}32_e32 [[ADD1:v[0-9]+]], vcc, 2, [[LOAD1]]
; GCN: s_swappc_b64
@@ -80,10 +78,10 @@ entry:
; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s5 offset:8
; GCN-DAG: buffer_store_dword [[THIRTEEN]], off, s[0:3], s5 offset:24
-; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5 offset:8
-; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:12
-; GCN: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s5 offset:16
-; GCN: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s5 offset:20
+; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5 offset:8
+; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:12
+; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s5 offset:16
+; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s5 offset:20
; GCN-NOT: s_add_u32 s32, s32, 0x800
diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
index 2c8abf50090..14a6193b558 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
@@ -44,7 +44,7 @@ define void @callee_with_stack() #0 {
; GCN-DAG: v_writelane_b32 v32, s35,
; GCN-DAG: s_add_u32 s32, s32, 0x300{{$}}
; GCN-DAG: v_mov_b32_e32 v0, 0{{$}}
-; GCN: buffer_store_dword v0, off, s[0:3], s5 offset:4{{$}}
+; GCN-DAG: buffer_store_dword v0, off, s[0:3], s5 offset:4{{$}}
; GCN-DAG: s_mov_b32 s33, s5
diff --git a/llvm/test/CodeGen/AMDGPU/inserted-wait-states.mir b/llvm/test/CodeGen/AMDGPU/inserted-wait-states.mir
index e39fb65a2a1..f83fd1c8f47 100644
--- a/llvm/test/CodeGen/AMDGPU/inserted-wait-states.mir
+++ b/llvm/test/CodeGen/AMDGPU/inserted-wait-states.mir
@@ -308,7 +308,7 @@ body: |
bb.1:
$vgpr0,$sgpr0_sgpr1 = V_ADD_I32_e64 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
- $vgpr4 = V_WRITELANE_B32 $sgpr0, $sgpr0
+ $vgpr4 = V_WRITELANE_B32 $sgpr0, $sgpr0, $vgpr4
S_BRANCH %bb.2
bb.2:
@@ -318,7 +318,7 @@ body: |
bb.3:
$vgpr0,implicit $vcc = V_ADD_I32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
- $vgpr4 = V_WRITELANE_B32 $sgpr4, $vcc_lo
+ $vgpr4 = V_WRITELANE_B32 $sgpr4, $vcc_lo, $vgpr4
S_ENDPGM
...
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
new file mode 100644
index 00000000000..361756a013b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
@@ -0,0 +1,82 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx802 -verify-machineinstrs < %s | FileCheck %s
+
+declare i32 @llvm.amdgcn.writelane(i32, i32, i32) #0
+
+; CHECK-LABEL: {{^}}test_writelane_sreg:
+; CHECK: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
+define amdgpu_kernel void @test_writelane_sreg(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #1 {
+ %oldval = load i32, i32 addrspace(1)* %out
+ %writelane = call i32 @llvm.amdgcn.writelane(i32 %src0, i32 %src1, i32 %oldval)
+ store i32 %writelane, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_writelane_imm_sreg:
+; CHECK: v_writelane_b32 v{{[0-9]+}}, 32, s{{[0-9]+}}
+define amdgpu_kernel void @test_writelane_imm_sreg(i32 addrspace(1)* %out, i32 %src1) #1 {
+ %oldval = load i32, i32 addrspace(1)* %out
+ %writelane = call i32 @llvm.amdgcn.writelane(i32 32, i32 %src1, i32 %oldval)
+ store i32 %writelane, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_writelane_vreg_lane:
+; CHECK: v_readfirstlane_b32 [[LANE:s[0-9]+]], v{{[0-9]+}}
+; CHECK: v_writelane_b32 v{{[0-9]+}}, 12, [[LANE]]
+define amdgpu_kernel void @test_writelane_vreg_lane(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.in = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 %tid
+ %args = load <2 x i32>, <2 x i32> addrspace(1)* %gep.in
+ %oldval = load i32, i32 addrspace(1)* %out
+ %lane = extractelement <2 x i32> %args, i32 1
+ %writelane = call i32 @llvm.amdgcn.writelane(i32 12, i32 %lane, i32 %oldval)
+ store i32 %writelane, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; TODO: m0 should be folded.
+; CHECK-LABEL: {{^}}test_writelane_m0_sreg:
+; CHECK: s_mov_b32 m0, -1
+; CHECK: s_mov_b32 [[COPY_M0:s[0-9]+]], m0
+; CHECK: v_writelane_b32 v{{[0-9]+}}, [[COPY_M0]], s{{[0-9]+}}
+define amdgpu_kernel void @test_writelane_m0_sreg(i32 addrspace(1)* %out, i32 %src1) #1 {
+ %oldval = load i32, i32 addrspace(1)* %out
+ %m0 = call i32 asm "s_mov_b32 m0, -1", "={M0}"()
+ %writelane = call i32 @llvm.amdgcn.writelane(i32 %m0, i32 %src1, i32 %oldval)
+ store i32 %writelane, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_writelane_imm:
+; CHECK: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 32
+define amdgpu_kernel void @test_writelane_imm(i32 addrspace(1)* %out, i32 %src0) #1 {
+ %oldval = load i32, i32 addrspace(1)* %out
+ %writelane = call i32 @llvm.amdgcn.writelane(i32 %src0, i32 32, i32 %oldval) #0
+ store i32 %writelane, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_writelane_sreg_oldval:
+; CHECK: v_mov_b32_e32 [[OLDVAL:v[0-9]+]], s{{[0-9]+}}
+; CHECK: v_writelane_b32 [[OLDVAL]], s{{[0-9]+}}, s{{[0-9]+}}
+define amdgpu_kernel void @test_writelane_sreg_oldval(i32 inreg %oldval, i32 addrspace(1)* %out, i32 %src0, i32 %src1) #1 {
+ %writelane = call i32 @llvm.amdgcn.writelane(i32 %src0, i32 %src1, i32 %oldval)
+ store i32 %writelane, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_writelane_imm_oldval:
+; CHECK: v_mov_b32_e32 [[OLDVAL:v[0-9]+]], 42
+; CHECK: v_writelane_b32 [[OLDVAL]], s{{[0-9]+}}, s{{[0-9]+}}
+define amdgpu_kernel void @test_writelane_imm_oldval(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #1 {
+ %writelane = call i32 @llvm.amdgcn.writelane(i32 %src0, i32 %src1, i32 42)
+ store i32 %writelane, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #2
+
+attributes #0 = { nounwind readnone convergent }
+attributes #1 = { nounwind }
+attributes #2 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll
index f7e8a1d80e9..1c0076c9cd9 100644
--- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll
@@ -216,7 +216,7 @@ entry:
; GCN-DAG: v_writelane_b32 v34, s35, 2
; GCN-DAG: s_add_u32 s32, s32, 0x400
-; GCN: s_getpc_b64
+; GCN-DAG: s_getpc_b64
; GCN: s_swappc_b64
; GCN: s_getpc_b64 s[6:7]
OpenPOWER on IntegriCloud