diff options
-rw-r--r-- | llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp | 187 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/indirect-addressing-undef.mir | 10 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/skip-if-dead.ll | 145 |
3 files changed, 272 insertions, 70 deletions
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp index 3d6fc9eeba7..f989b5b9bb0 100644 --- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -76,7 +76,7 @@ private: bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To); void Skip(MachineInstr &From, MachineOperand &To); - void SkipIfDead(MachineInstr &MI); + bool skipIfDead(MachineInstr &MI); void If(MachineInstr &MI); void Else(MachineInstr &MI, bool ExecModified); @@ -89,12 +89,16 @@ private: void Kill(MachineInstr &MI); void Branch(MachineInstr &MI); - void splitBlockLiveIns(const MachineBasicBlock &MBB, - const MachineInstr &MI, - MachineBasicBlock &LoopBB, - MachineBasicBlock &RemainderBB, - unsigned SaveReg, - const MachineOperand &IdxReg); + std::pair<MachineBasicBlock *, MachineBasicBlock *> + splitBlock(MachineBasicBlock &MBB, MachineBasicBlock::iterator I); + + void splitLoadM0BlockLiveIns(LivePhysRegs &RemainderLiveRegs, + const MachineRegisterInfo &MRI, + const MachineInstr &MI, + MachineBasicBlock &LoopBB, + MachineBasicBlock &RemainderBB, + unsigned SaveReg, + const MachineOperand &IdxReg); void emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB, DebugLoc DL, MachineInstr *MovRel, @@ -171,7 +175,19 @@ bool SILowerControlFlow::shouldSkip(MachineBasicBlock *From, I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ) return true; - if (++NumInstr >= SkipThreshold) + if (I->isInlineAsm()) { + const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo(); + const char *AsmStr = I->getOperand(0).getSymbolName(); + + // inlineasm length estimate is number of bytes assuming the longest + // instruction. + uint64_t MaxAsmSize = TII->getInlineAsmLength(AsmStr, *MAI); + NumInstr += MaxAsmSize / MAI->getMaxInstLength(); + } else { + ++NumInstr; + } + + if (NumInstr >= SkipThreshold) return true; } } @@ -189,36 +205,55 @@ void SILowerControlFlow::Skip(MachineInstr &From, MachineOperand &To) { .addOperand(To); } -void SILowerControlFlow::SkipIfDead(MachineInstr &MI) { - +bool SILowerControlFlow::skipIfDead(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); - DebugLoc DL = MI.getDebugLoc(); if (MBB.getParent()->getFunction()->getCallingConv() != CallingConv::AMDGPU_PS || !shouldSkip(&MBB, &MBB.getParent()->back())) - return; + return false; - MachineBasicBlock::iterator Insert = &MI; - ++Insert; + LivePhysRegs RemainderLiveRegs(TRI); + RemainderLiveRegs.addLiveOuts(MBB); + + MachineBasicBlock *SkipBB; + MachineBasicBlock *RemainderBB; + std::tie(SkipBB, RemainderBB) = splitBlock(MBB, MI.getIterator()); + + const DebugLoc &DL = MI.getDebugLoc(); // If the exec mask is non-zero, skip the next two instructions - BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) - .addImm(3); + BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) + .addMBB(RemainderBB); + + MBB.addSuccessor(RemainderBB); + + MachineBasicBlock::iterator Insert = SkipBB->begin(); // Exec mask is zero: Export to NULL target... - BuildMI(MBB, Insert, DL, TII->get(AMDGPU::EXP)) - .addImm(0) - .addImm(0x09) // V_008DFC_SQ_EXP_NULL - .addImm(0) - .addImm(1) - .addImm(1) - .addReg(AMDGPU::VGPR0) - .addReg(AMDGPU::VGPR0) - .addReg(AMDGPU::VGPR0) - .addReg(AMDGPU::VGPR0); - - // ... and terminate wavefront - BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM)); + BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP)) + .addImm(0) + .addImm(0x09) // V_008DFC_SQ_EXP_NULL + .addImm(0) + .addImm(1) + .addImm(1) + .addReg(AMDGPU::VGPR0, RegState::Undef) + .addReg(AMDGPU::VGPR0, RegState::Undef) + .addReg(AMDGPU::VGPR0, RegState::Undef) + .addReg(AMDGPU::VGPR0, RegState::Undef); + + // ... and terminate wavefront. + BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM)); + + for (const MachineInstr &Inst : reverse(*RemainderBB)) + RemainderLiveRegs.stepBackward(Inst); + + const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + for (unsigned Reg : RemainderLiveRegs) { + if (MRI.isAllocatable(Reg)) + RemainderBB->addLiveIn(Reg); + } + + return true; } void SILowerControlFlow::If(MachineInstr &MI) { @@ -386,20 +421,13 @@ void SILowerControlFlow::Kill(MachineInstr &MI) { } // All currently live registers must remain so in the remainder block. -void SILowerControlFlow::splitBlockLiveIns(const MachineBasicBlock &MBB, - const MachineInstr &MI, - MachineBasicBlock &LoopBB, - MachineBasicBlock &RemainderBB, - unsigned SaveReg, - const MachineOperand &IdxReg) { - LivePhysRegs RemainderLiveRegs(TRI); - - RemainderLiveRegs.addLiveOuts(MBB); - for (MachineBasicBlock::const_reverse_iterator I = MBB.rbegin(), E(&MI); - I != E; ++I) { - RemainderLiveRegs.stepBackward(*I); - } - +void SILowerControlFlow::splitLoadM0BlockLiveIns(LivePhysRegs &RemainderLiveRegs, + const MachineRegisterInfo &MRI, + const MachineInstr &MI, + MachineBasicBlock &LoopBB, + MachineBasicBlock &RemainderBB, + unsigned SaveReg, + const MachineOperand &IdxReg) { // Add reg defined in loop body. RemainderLiveRegs.addReg(SaveReg); @@ -410,13 +438,11 @@ void SILowerControlFlow::splitBlockLiveIns(const MachineBasicBlock &MBB, } } - const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); for (unsigned Reg : RemainderLiveRegs) { if (MRI.isAllocatable(Reg)) RemainderBB.addLiveIn(Reg); } - const MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src); if (!Src->isUndef()) LoopBB.addLiveIn(Src->getReg()); @@ -469,6 +495,30 @@ void SILowerControlFlow::emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB, .addMBB(&LoopBB); } +std::pair<MachineBasicBlock *, MachineBasicBlock *> +SILowerControlFlow::splitBlock(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) { + MachineFunction *MF = MBB.getParent(); + + // To insert the loop we need to split the block. Move everything after this + // point to a new block, and insert a new empty block between the two. + MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock(); + MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock(); + MachineFunction::iterator MBBI(MBB); + ++MBBI; + + MF->insert(MBBI, LoopBB); + MF->insert(MBBI, RemainderBB); + + // Move the rest of the block into a new block. + RemainderBB->transferSuccessors(&MBB); + RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end()); + + MBB.addSuccessor(LoopBB); + + return std::make_pair(LoopBB, RemainderBB); +} + // Returns true if a new block was inserted. bool SILowerControlFlow::loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset) { MachineBasicBlock &MBB = *MI.getParent(); @@ -492,7 +542,6 @@ bool SILowerControlFlow::loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offs return false; } - MachineFunction &MF = *MBB.getParent(); MachineOperand *SaveOp = TII->getNamedOperand(MI, AMDGPU::OpName::sdst); SaveOp->setIsDead(false); unsigned Save = SaveOp->getReg(); @@ -505,25 +554,24 @@ bool SILowerControlFlow::loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offs BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), Save) .addReg(AMDGPU::EXEC); - // To insert the loop we need to split the block. Move everything after this - // point to a new block, and insert a new empty block between the two. - MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock(); - MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock(); - MachineFunction::iterator MBBI(MBB); - ++MBBI; + LivePhysRegs RemainderLiveRegs(TRI); - MF.insert(MBBI, LoopBB); - MF.insert(MBBI, RemainderBB); + RemainderLiveRegs.addLiveOuts(MBB); - LoopBB->addSuccessor(LoopBB); - LoopBB->addSuccessor(RemainderBB); + MachineBasicBlock *LoopBB; + MachineBasicBlock *RemainderBB; - splitBlockLiveIns(MBB, MI, *LoopBB, *RemainderBB, Save, *Idx); + std::tie(LoopBB, RemainderBB) = splitBlock(MBB, I); - // Move the rest of the block into a new block. - RemainderBB->transferSuccessors(&MBB); - RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end()); - MBB.addSuccessor(LoopBB); + for (const MachineInstr &Inst : reverse(*RemainderBB)) + RemainderLiveRegs.stepBackward(Inst); + + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + LoopBB->addSuccessor(RemainderBB); + LoopBB->addSuccessor(LoopBB); + + splitLoadM0BlockLiveIns(RemainderLiveRegs, MRI, MI, *LoopBB, + *RemainderBB, Save, *Idx); emitLoadM0FromVGPRLoop(*LoopBB, DL, MovRel, *Idx, Offset); @@ -695,16 +743,25 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { case AMDGPU::SI_END_CF: if (--Depth == 0 && HaveKill) { - SkipIfDead(MI); HaveKill = false; + + if (skipIfDead(MI)) { + NextBB = std::next(BI); + BE = MF.end(); + Next = MBB.end(); + } } EndCf(MI); break; case AMDGPU::SI_KILL: - if (Depth == 0) - SkipIfDead(MI); - else + if (Depth == 0) { + if (skipIfDead(MI)) { + NextBB = std::next(BI); + BE = MF.end(); + Next = MBB.end(); + } + } else HaveKill = true; Kill(MI); break; diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-undef.mir b/llvm/test/CodeGen/AMDGPU/indirect-addressing-undef.mir index 0e20a736b1e..345cd681dc5 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-undef.mir +++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-undef.mir @@ -3,7 +3,7 @@ # CHECK-LABEL: name: extract_undef_offset_vgpr{{$}} # CHECK: bb.1: -# CHECK: successors: %bb.1(0x40000000 / 0x80000000 = 50.00%), %bb.2(0x40000000 / 0x80000000 = 50.00%) +# CHECK: successors: %bb.2(0x40000000 / 0x80000000 = 50.00%), %bb.1(0x40000000 / 0x80000000 = 50.00%) # CHECK: liveins: %vgpr0_vgpr1_vgpr2_vgpr3{{$}} # CHECK: V_READFIRSTLANE_B32 undef %vgpr10, implicit %exec @@ -103,7 +103,7 @@ body: | # CHECK-LABEL: name: extract_undef_neg_offset_vgpr{{$}} # CHECK: bb.1: -# CHECK: successors: %bb.1(0x40000000 / 0x80000000 = 50.00%), %bb.2(0x40000000 / 0x80000000 = 50.00%) +# CHECK: successors: %bb.2(0x40000000 / 0x80000000 = 50.00%), %bb.1(0x40000000 / 0x80000000 = 50.00%) # CHECK: liveins: %vgpr0_vgpr1_vgpr2_vgpr3{{$}} # CHECK: %vcc_lo = V_READFIRSTLANE_B32 undef %vgpr10, implicit %exec @@ -159,7 +159,7 @@ body: | # CHECK-LABEL: name: insert_undef_offset_vgpr{{$}} # CHECK: bb.1: -# CHECK: successors: %bb.1(0x40000000 / 0x80000000 = 50.00%), %bb.2(0x40000000 / 0x80000000 = 50.00%) +# CHECK: successors: %bb.2(0x40000000 / 0x80000000 = 50.00%), %bb.1(0x40000000 / 0x80000000 = 50.00%) # CHECK: liveins: %vgpr4, %vgpr0_vgpr1_vgpr2_vgpr3{{$}} # CHECK: %vcc_lo = V_READFIRSTLANE_B32 undef %vgpr10, implicit %exec @@ -215,7 +215,7 @@ body: | # CHECK-LABEL: name: insert_undef_neg_offset_vgpr{{$}} # CHECK: bb.1: -# CHECK: successors: %bb.1(0x40000000 / 0x80000000 = 50.00%), %bb.2(0x40000000 / 0x80000000 = 50.00%) +# CHECK: successors: %bb.2(0x40000000 / 0x80000000 = 50.00%), %bb.1(0x40000000 / 0x80000000 = 50.00%) # CHECK: liveins: %vgpr4, %vgpr0_vgpr1_vgpr2_vgpr3{{$}} # CHECK: %vcc_lo = V_READFIRSTLANE_B32 undef %vgpr10, implicit %exec @@ -272,7 +272,7 @@ body: | # CHECK-LABEL: insert_undef_value_offset_vgpr{{$}} # CHECK: bb.1: -# CHECK: successors: %bb.1(0x40000000 / 0x80000000 = 50.00%), %bb.2(0x40000000 / 0x80000000 = 50.00%) +# CHECK: successors: %bb.2(0x40000000 / 0x80000000 = 50.00%), %bb.1(0x40000000 / 0x80000000 = 50.00%) # CHECK: liveins: %vgpr4, %vgpr0_vgpr1_vgpr2_vgpr3{{$}} # CHECK: %vcc_lo = V_READFIRSTLANE_B32 %vgpr4, implicit %exec diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll new file mode 100644 index 00000000000..1999c0cd683 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll @@ -0,0 +1,145 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s + +; CHECK-LABEL: {{^}}test_kill_depth_0_imm_pos: +; CHECK-NEXT: ; BB#0: +; CHECK-NEXT: s_endpgm +define amdgpu_ps void @test_kill_depth_0_imm_pos() #0 { + call void @llvm.AMDGPU.kill(float 0.0) + ret void +} + +; CHECK-LABEL: {{^}}test_kill_depth_0_imm_neg: +; CHECK-NEXT: ; BB#0: +; CHECK-NEXT: s_mov_b64 exec, 0 +; CHECK-NEXT: s_endpgm +define amdgpu_ps void @test_kill_depth_0_imm_neg() #0 { + call void @llvm.AMDGPU.kill(float -0.0) + ret void +} + +; CHECK-LABEL: {{^}}test_kill_depth_var: +; CHECK-NEXT: ; BB#0: +; CHECK-NEXT: v_cmpx_le_f32_e32 vcc, 0, v0 +; CHECK-NEXT: s_endpgm +define amdgpu_ps void @test_kill_depth_var(float %x) #0 { + call void @llvm.AMDGPU.kill(float %x) + ret void +} + +; FIXME: why does the skip depend on the asm length in the same block? + +; CHECK-LABEL: {{^}}test_kill_control_flow: +; CHECK: s_cmp_lg_i32 s{{[0-9]+}}, 0 +; CHECK: s_cbranch_scc1 [[RETURN_BB:BB[0-9]+_[0-9]+]] + +; CHECK: ; BB#1: +; CHECK: v_nop_e64 +; CHECK: v_nop_e64 +; CHECK: v_nop_e64 +; CHECK: v_nop_e64 +; CHECK: v_nop_e64 +; CHECK: v_nop_e64 +; CHECK: v_nop_e64 +; CHECK: v_nop_e64 +; CHECK: v_nop_e64 +; CHECK: v_nop_e64 + +; CHECK: s_cbranch_execnz [[SPLIT_BB:BB[0-9]+_[0-9]+]] +; CHECK-NEXT: ; BB#3: +; CHECK-NEXT: exp 0, 9, 0, 1, 1, v0, v0, v0, v0 +; CHECK-NEXT: s_endpgm + +; CHECK-NEXT: {{^}}[[SPLIT_BB]]: +; CHECK-NEXT: v_cmpx_le_f32_e32 vcc, 0, v7 +; CHECK-NEXT: {{^}}BB{{[0-9]+_[0-9]+}}: +; CHECK-NEXT: s_endpgm +define amdgpu_ps void @test_kill_control_flow(i32 inreg %arg) #0 { +entry: + %cmp = icmp eq i32 %arg, 0 + br i1 %cmp, label %bb, label %exit + +bb: + %var = call float asm sideeffect " + v_mov_b32_e64 v7, -1 + v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64", "={VGPR7}"() + call void @llvm.AMDGPU.kill(float %var) + br label %exit + +exit: + ret void +} + +; CHECK-LABEL: {{^}}test_kill_control_flow_remainder: +; CHECK: s_cmp_lg_i32 s{{[0-9]+}}, 0 +; CHECK-NEXT: s_cbranch_scc1 [[RETURN_BB:BB[0-9]+_[0-9]+]] + +; CHECK-NEXT: ; BB#1: ; %bb +; CHECK: v_mov_b32_e64 v7, -1 +; CHECK: v_nop_e64 +; CHECK: v_nop_e64 +; CHECK: v_nop_e64 +; CHECK: v_nop_e64 +; CHECK: v_nop_e64 +; CHECK: v_nop_e64 +; CHECK: v_nop_e64 +; CHECK: v_nop_e64 +; CHECK: ;;#ASMEND +; CHECK: v_mov_b32_e64 v8, -1 +; CHECK: ;;#ASMEND +; CHECK-NEXT: s_cbranch_execnz [[SPLIT_BB:BB[0-9]+_[0-9]+]] + +; CHECK-NEXT: ; BB#3: +; CHECK-NEXT: exp 0, 9, 0, 1, 1, v0, v0, v0, v0 +; CHECK-NEXT: s_endpgm + +; CHECK-NEXT: {{^}}[[SPLIT_BB]]: +; CHECK-NEXT: v_cmpx_le_f32_e32 vcc, 0, v7 +; CHECK: buffer_store_dword v8 +; CHECK: v_mov_b32_e64 v9, -2 + +; CHECK: {{^}}BB{{[0-9]+_[0-9]+}}: +; CHECK: buffer_store_dword v9 +; CHECK-NEXT: s_endpgm +define amdgpu_ps void @test_kill_control_flow_remainder(i32 inreg %arg) #0 { +entry: + %cmp = icmp eq i32 %arg, 0 + br i1 %cmp, label %bb, label %exit + +bb: + %var = call float asm sideeffect " + v_mov_b32_e64 v7, -1 + v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64", "={VGPR7}"() + %live.across = call float asm sideeffect "v_mov_b32_e64 v8, -1", "={VGPR8}"() + call void @llvm.AMDGPU.kill(float %var) + store volatile float %live.across, float addrspace(1)* undef + %live.out = call float asm sideeffect "v_mov_b32_e64 v9, -2", "={VGPR9}"() + br label %exit + +exit: + %phi = phi float [ 0.0, %entry ], [ %live.out, %bb ] + store float %phi, float addrspace(1)* undef + ret void +} + +declare void @llvm.AMDGPU.kill(float) #0 + +attributes #0 = { nounwind } |