diff options
| | | |
|---|---|---|
| author | Matt Arsenault <Matthew.Arsenault@amd.com> | 2016-07-12 21:41:32 +0000 |
| committer | Matt Arsenault <Matthew.Arsenault@amd.com> | 2016-07-12 21:41:32 +0000 |
| commit | 786724a22ecff6afa9484714be8448429fdd021c (patch) | |
| tree | c672bbed3539107738cf537e5aefc08937593127 /llvm | |
| parent | 8950ad12adfdf6f13426171643a0b56e91dd7fd1 (diff) | |
| download | bcm5719-llvm-786724a22ecff6afa9484714be8448429fdd021c.tar.gz bcm5719-llvm-786724a22ecff6afa9484714be8448429fdd021c.zip | |
AMDGPU: Follow up to r275203
I meant to squash this into it.
llvm-svn: 275220
Diffstat (limited to 'llvm')
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 63 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.h | 3 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstructions.td | 12 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp | 51 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp | 5 | ||||
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/skip-if-dead.ll | 123 |
6 files changed, 218 insertions, 39 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index d98fedbacb0..72175ea581b 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1070,9 +1070,64 @@ unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT, + StringRef(RegName) + "\".")); } -MachineBasicBlock * -SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, - MachineBasicBlock *BB) const { +// If kill is not the last instruction, split the block so kill is always a +// proper terminator. +MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI, + MachineBasicBlock *BB) const { + const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); + + MachineBasicBlock::iterator SplitPoint(&MI); + ++SplitPoint; + + if (SplitPoint == BB->end()) { + // Don't bother with a new block. + MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR)); + return BB; + } + + MachineFunction *MF = BB->getParent(); + MachineBasicBlock *SplitBB + = MF->CreateMachineBasicBlock(BB->getBasicBlock()); + + SmallSet<unsigned, 8> SplitDefRegs; + for (auto I = SplitPoint, E = BB->end(); I != E; ++I) { + for (MachineOperand &Def : I->defs()) + SplitDefRegs.insert(Def.getReg()); + } + + // Fix the block phi references to point to the new block for the defs in the + // second piece of the block. 
+ for (MachineBasicBlock *Succ : BB->successors()) { + for (MachineInstr &MI : *Succ) { + if (!MI.isPHI()) + break; + + for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { + unsigned IncomingReg = MI.getOperand(I).getReg(); + MachineOperand &FromBB = MI.getOperand(I + 1); + if (BB == FromBB.getMBB()) { + if (SplitDefRegs.count(IncomingReg)) + FromBB.setMBB(SplitBB); + + break; + } + } + } + } + + MF->insert(++MachineFunction::iterator(BB), SplitBB); + SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end()); + + + SplitBB->transferSuccessors(BB); + BB->addSuccessor(SplitBB); + + MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR)); + return SplitBB; +} + +MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( + MachineInstr &MI, MachineBasicBlock *BB) const { switch (MI.getOpcode()) { case AMDGPU::SI_INIT_M0: { const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); @@ -1096,6 +1151,8 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MI.eraseFromParent(); return BB; } + case AMDGPU::SI_KILL: + return splitKillBlock(MI, BB); default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index 6833e15e4fd..8e055eea58c 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -123,6 +123,9 @@ public: unsigned getRegisterByName(const char* RegName, EVT VT, SelectionDAG &DAG) const override; + MachineBasicBlock *splitKillBlock(MachineInstr &MI, + MachineBasicBlock *BB) const; + MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 7cf5faa216d..858505bea3b 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1989,8 +1989,16 @@ def SI_END_CF : PseudoInstSI < let Uses = [EXEC], 
Defs = [EXEC,VCC] in { def SI_KILL : PseudoInstSI < (outs), (ins VSrc_32:$src), - [(int_AMDGPU_kill f32:$src)] ->; + [(int_AMDGPU_kill f32:$src)]> { + let isConvergent = 1; + let usesCustomInserter = 1; +} + +def SI_KILL_TERMINATOR : PseudoInstSI < + (outs), (ins VSrc_32:$src)> { + let isTerminator = 1; +} + } // End Uses = [EXEC], Defs = [EXEC,VCC] } // End mayLoad = 1, mayStore = 1, hasSideEffects = 1 diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp index f989b5b9bb0..adb0919231c 100644 --- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -76,7 +76,7 @@ private: bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To); void Skip(MachineInstr &From, MachineOperand &To); - bool skipIfDead(MachineInstr &MI); + bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB); void If(MachineInstr &MI); void Else(MachineInstr &MI, bool ExecModified); @@ -89,6 +89,9 @@ private: void Kill(MachineInstr &MI); void Branch(MachineInstr &MI); + MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const; + std::pair<MachineBasicBlock *, MachineBasicBlock *> splitBlock(MachineBasicBlock &MBB, MachineBasicBlock::iterator I); @@ -205,27 +208,22 @@ void SILowerControlFlow::Skip(MachineInstr &From, MachineOperand &To) { .addOperand(To); } -bool SILowerControlFlow::skipIfDead(MachineInstr &MI) { +bool SILowerControlFlow::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) { MachineBasicBlock &MBB = *MI.getParent(); + MachineFunction *MF = MBB.getParent(); - if (MBB.getParent()->getFunction()->getCallingConv() != CallingConv::AMDGPU_PS || + if (MF->getFunction()->getCallingConv() != CallingConv::AMDGPU_PS || !shouldSkip(&MBB, &MBB.getParent()->back())) return false; - LivePhysRegs RemainderLiveRegs(TRI); - RemainderLiveRegs.addLiveOuts(MBB); - - MachineBasicBlock *SkipBB; - MachineBasicBlock *RemainderBB; - std::tie(SkipBB, 
RemainderBB) = splitBlock(MBB, MI.getIterator()); + MachineBasicBlock *SkipBB = insertSkipBlock(MBB, MI.getIterator()); + SkipBB->addSuccessor(&NextBB); const DebugLoc &DL = MI.getDebugLoc(); // If the exec mask is non-zero, skip the next two instructions BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) - .addMBB(RemainderBB); - - MBB.addSuccessor(RemainderBB); + .addMBB(&NextBB); MachineBasicBlock::iterator Insert = SkipBB->begin(); @@ -244,15 +242,6 @@ bool SILowerControlFlow::skipIfDead(MachineInstr &MI) { // ... and terminate wavefront. BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM)); - for (const MachineInstr &Inst : reverse(*RemainderBB)) - RemainderLiveRegs.stepBackward(Inst); - - const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - for (unsigned Reg : RemainderLiveRegs) { - if (MRI.isAllocatable(Reg)) - RemainderBB->addLiveIn(Reg); - } - return true; } @@ -495,6 +484,20 @@ void SILowerControlFlow::emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB, .addMBB(&LoopBB); } +MachineBasicBlock *SILowerControlFlow::insertSkipBlock( + MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { + MachineFunction *MF = MBB.getParent(); + + MachineBasicBlock *SkipBB = MF->CreateMachineBasicBlock(); + MachineFunction::iterator MBBI(MBB); + ++MBBI; + + MF->insert(MBBI, SkipBB); + MBB.addSuccessor(SkipBB); + + return SkipBB; +} + std::pair<MachineBasicBlock *, MachineBasicBlock *> SILowerControlFlow::splitBlock(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) { @@ -745,7 +748,7 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { if (--Depth == 0 && HaveKill) { HaveKill = false; - if (skipIfDead(MI)) { + if (skipIfDead(MI, *NextBB)) { NextBB = std::next(BI); BE = MF.end(); Next = MBB.end(); @@ -754,9 +757,9 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { EndCf(MI); break; - case AMDGPU::SI_KILL: + case AMDGPU::SI_KILL_TERMINATOR: if (Depth == 0) { - if (skipIfDead(MI)) { + if (skipIfDead(MI, 
*NextBB)) { NextBB = std::next(BI); BE = MF.end(); Next = MBB.end(); diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index 100fb2896dd..dafc772ea4f 100644 --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -223,7 +223,7 @@ void SIWholeQuadMode::propagateInstruction(const MachineInstr &MI, // Control flow-type instructions that are followed by WQM computations // must themselves be in WQM. if ((II.OutNeeds & StateWQM) && !(II.Needs & StateWQM) && - (MI.isBranch() || MI.isTerminator() || MI.getOpcode() == AMDGPU::SI_KILL)) { + (MI.isBranch() || MI.isTerminator())) { Instructions[&MI].Needs = StateWQM; II.Needs = StateWQM; } @@ -444,9 +444,6 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, State = Needs; } - - if (MI.getOpcode() == AMDGPU::SI_KILL) - WQMFromExec = false; } if ((BI.OutNeeds & StateWQM) && State != StateWQM) { diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll index 1999c0cd683..54fc0bf0b68 100644 --- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll +++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll @@ -2,6 +2,7 @@ ; CHECK-LABEL: {{^}}test_kill_depth_0_imm_pos: ; CHECK-NEXT: ; BB#0: +; CHECK-NEXT: ; BB#1: ; CHECK-NEXT: s_endpgm define amdgpu_ps void @test_kill_depth_0_imm_pos() #0 { call void @llvm.AMDGPU.kill(float 0.0) @@ -11,28 +12,87 @@ define amdgpu_ps void @test_kill_depth_0_imm_pos() #0 { ; CHECK-LABEL: {{^}}test_kill_depth_0_imm_neg: ; CHECK-NEXT: ; BB#0: ; CHECK-NEXT: s_mov_b64 exec, 0 +; CHECK-NEXT: ; BB#1: ; CHECK-NEXT: s_endpgm define amdgpu_ps void @test_kill_depth_0_imm_neg() #0 { call void @llvm.AMDGPU.kill(float -0.0) ret void } +; FIXME: Ideally only one would be emitted +; CHECK-LABEL: {{^}}test_kill_depth_0_imm_neg_x2: +; CHECK-NEXT: ; BB#0: +; CHECK-NEXT: s_mov_b64 exec, 0 +; CHECK-NEXT: ; BB#1: +; CHECK-NEXT: s_mov_b64 exec, 0 +; CHECK-NEXT: ; BB#2: +; 
CHECK-NEXT: s_endpgm +define amdgpu_ps void @test_kill_depth_0_imm_neg_x2() #0 { + call void @llvm.AMDGPU.kill(float -0.0) + call void @llvm.AMDGPU.kill(float -1.0) + ret void +} + ; CHECK-LABEL: {{^}}test_kill_depth_var: ; CHECK-NEXT: ; BB#0: ; CHECK-NEXT: v_cmpx_le_f32_e32 vcc, 0, v0 +; CHECK-NEXT: ; BB#1: ; CHECK-NEXT: s_endpgm define amdgpu_ps void @test_kill_depth_var(float %x) #0 { call void @llvm.AMDGPU.kill(float %x) ret void } +; FIXME: Ideally only one would be emitted +; CHECK-LABEL: {{^}}test_kill_depth_var_x2_same: +; CHECK-NEXT: ; BB#0: +; CHECK-NEXT: v_cmpx_le_f32_e32 vcc, 0, v0 +; CHECK-NEXT: ; BB#1: +; CHECK-NEXT: v_cmpx_le_f32_e32 vcc, 0, v0 +; CHECK-NEXT: ; BB#2: +; CHECK-NEXT: s_endpgm +define amdgpu_ps void @test_kill_depth_var_x2_same(float %x) #0 { + call void @llvm.AMDGPU.kill(float %x) + call void @llvm.AMDGPU.kill(float %x) + ret void +} + +; CHECK-LABEL: {{^}}test_kill_depth_var_x2: +; CHECK-NEXT: ; BB#0: +; CHECK-NEXT: v_cmpx_le_f32_e32 vcc, 0, v0 +; CHECK-NEXT: ; BB#1: +; CHECK-NEXT: v_cmpx_le_f32_e32 vcc, 0, v1 +; CHECK-NEXT: ; BB#2: +; CHECK-NEXT: s_endpgm +define amdgpu_ps void @test_kill_depth_var_x2(float %x, float %y) #0 { + call void @llvm.AMDGPU.kill(float %x) + call void @llvm.AMDGPU.kill(float %y) + ret void +} + +; CHECK-LABEL: {{^}}test_kill_depth_var_x2_instructions: +; CHECK-NEXT: ; BB#0: +; CHECK-NEXT: v_cmpx_le_f32_e32 vcc, 0, v0 +; CHECK-NEXT: ; BB#1: +; CHECK: v_mov_b32_e64 v7, -1 +; CHECK: v_cmpx_le_f32_e32 vcc, 0, v7 +; CHECK-NEXT: ; BB#2: +; CHECK-NEXT: s_endpgm +define amdgpu_ps void @test_kill_depth_var_x2_instructions(float %x) #0 { + call void @llvm.AMDGPU.kill(float %x) + %y = call float asm sideeffect "v_mov_b32_e64 v7, -1", "={VGPR7}"() + call void @llvm.AMDGPU.kill(float %y) + ret void +} + ; FIXME: why does the skip depend on the asm length in the same block? 
; CHECK-LABEL: {{^}}test_kill_control_flow: ; CHECK: s_cmp_lg_i32 s{{[0-9]+}}, 0 ; CHECK: s_cbranch_scc1 [[RETURN_BB:BB[0-9]+_[0-9]+]] -; CHECK: ; BB#1: +; CHECK-NEXT: ; BB#1: +; CHECK: v_mov_b32_e64 v7, -1 ; CHECK: v_nop_e64 ; CHECK: v_nop_e64 ; CHECK: v_nop_e64 @@ -44,14 +104,13 @@ define amdgpu_ps void @test_kill_depth_var(float %x) #0 { ; CHECK: v_nop_e64 ; CHECK: v_nop_e64 -; CHECK: s_cbranch_execnz [[SPLIT_BB:BB[0-9]+_[0-9]+]] +; CHECK: v_cmpx_le_f32_e32 vcc, 0, v7 +; CHECK-NEXT: s_cbranch_execnz [[SPLIT_BB:BB[0-9]+_[0-9]+]] ; CHECK-NEXT: ; BB#3: ; CHECK-NEXT: exp 0, 9, 0, 1, 1, v0, v0, v0, v0 ; CHECK-NEXT: s_endpgm ; CHECK-NEXT: {{^}}[[SPLIT_BB]]: -; CHECK-NEXT: v_cmpx_le_f32_e32 vcc, 0, v7 -; CHECK-NEXT: {{^}}BB{{[0-9]+_[0-9]+}}: ; CHECK-NEXT: s_endpgm define amdgpu_ps void @test_kill_control_flow(i32 inreg %arg) #0 { entry: @@ -95,14 +154,14 @@ exit: ; CHECK: ;;#ASMEND ; CHECK: v_mov_b32_e64 v8, -1 ; CHECK: ;;#ASMEND +; CHECK: v_cmpx_le_f32_e32 vcc, 0, v7 ; CHECK-NEXT: s_cbranch_execnz [[SPLIT_BB:BB[0-9]+_[0-9]+]] -; CHECK-NEXT: ; BB#3: +; CHECK-NEXT: ; BB#4: ; CHECK-NEXT: exp 0, 9, 0, 1, 1, v0, v0, v0, v0 ; CHECK-NEXT: s_endpgm ; CHECK-NEXT: {{^}}[[SPLIT_BB]]: -; CHECK-NEXT: v_cmpx_le_f32_e32 vcc, 0, v7 ; CHECK: buffer_store_dword v8 ; CHECK: v_mov_b32_e64 v9, -2 @@ -140,6 +199,58 @@ exit: ret void } +; CHECK-LABEL: {{^}}test_kill_divergent_loop: +; CHECK: v_cmp_eq_i32_e32 vcc, 0, v0 +; CHECK-NEXT: s_and_saveexec_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], vcc +; CHECK-NEXT: s_xor_b64 [[SAVEEXEC]], exec, [[SAVEEXEC]] +; CHECK-NEXT: s_cbranch_execz [[EXIT:BB[0-9]+_[0-9]+]] +; CHECK-NEXT: ; mask branch [[EXIT]] + +; CHECK: [[LOOP_BB:BB[0-9]+_[0-9]+]]: + +; CHECK: v_mov_b32_e64 v7, -1 +; CHECK: v_nop_e64 +; CHECK: v_cmpx_le_f32_e32 vcc, 0, v7 + +; CHECK-NEXT: ; BB#3: +; CHECK: buffer_load_dword [[LOAD:v[0-9]+]] +; CHECK: v_cmp_eq_i32_e32 vcc, 0, [[LOAD]] +; CHECK-NEXT: s_and_b64 vcc, exec, vcc +; CHECK-NEXT: s_cbranch_vccnz [[LOOP_BB]] + +; CHECK-NEXT: 
{{^}}[[EXIT]]: +; CHECK: s_or_b64 exec, exec, [[SAVEEXEC]] +; CHECK: buffer_store_dword +; CHECK: s_endpgm +define amdgpu_ps void @test_kill_divergent_loop(i32 %arg) #0 { +entry: + %cmp = icmp eq i32 %arg, 0 + br i1 %cmp, label %bb, label %exit + +bb: + %var = call float asm sideeffect " + v_mov_b32_e64 v7, -1 + v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64", "={VGPR7}"() + call void @llvm.AMDGPU.kill(float %var) + %vgpr = load volatile i32, i32 addrspace(1)* undef + %loop.cond = icmp eq i32 %vgpr, 0 + br i1 %loop.cond, label %bb, label %exit + +exit: + store volatile i32 8, i32 addrspace(1)* undef + ret void +} + + declare void @llvm.AMDGPU.kill(float) #0 attributes #0 = { nounwind } |

