| author | cdevadas <cdevadas@amd.com> | 2019-10-26 14:33:36 +0530 |
|---|---|---|
| committer | cdevadas <cdevadas@amd.com> | 2019-10-26 14:37:45 +0530 |
| commit | e921ede54068b94702efcc1654dd0027c844012c (patch) | |
| tree | 73b51e9005851c71c03411bc631730ce1b1236ff | |
| parent | 5e307808557f4786c6438c9cfd67784073c5a3b7 (diff) | |
| download | bcm5719-llvm-e921ede54068b94702efcc1654dd0027c844012c.tar.gz bcm5719-llvm-e921ede54068b94702efcc1654dd0027c844012c.zip | |
[AMDGPU] Fix Vreg_1 PHI lowering in SILowerI1Copies.
There is a minor flaw in the implementation of the function
lowerPhis. This function rewrites PHIs whose values have regclass
Vreg_1 (boolean values) so that they use an SGPR class instead.
Currently it iterates over the MBBs, lowering each PHI in place,
and it fails to lower an incoming value that is itself another
Vreg_1 PHI. The failure occurs only when the MBB to which that
incoming PHI belongs has not been visited/lowered yet.
To fix this problem, collect all Vreg_1 PHIs upfront and then
perform the lowering.
Differential Revision: https://reviews.llvm.org/D69182
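The fix is a standard gather-then-mutate pattern: snapshot the work list before any lowering rewrites the function, so an incoming value that is itself a not-yet-visited Vreg_1 PHI is still observed in its original form, and re-run the per-block analysis only on block boundaries. Below is a minimal, self-contained C++ sketch of that structure; `Block`, `Instr`, `initializeAnalysis`, and `lowerPhi` are illustrative stand-ins (not LLVM APIs) for `MachineBasicBlock`, `MachineInstr`, `LoopFinder::initialize`, and the lowering body in the patch.

```cpp
#include <cstdio>
#include <vector>

// Illustrative stand-ins only (not LLVM APIs): Block/Instr play the role
// of MachineBasicBlock/MachineInstr in SILowerI1Copies::lowerPhis().
struct Block;
struct Instr {
  Block *Parent;    // owning basic block
  bool IsVreg1Phi;  // stand-in for isVreg1(MI.getOperand(0).getReg())
};
struct Block {
  int Id;
  std::vector<Instr> Phis;
};

// Stand-in for the per-block analysis (LF.initialize in the patch).
static void initializeAnalysis(const Block &B) {
  std::printf("re-init per-block analysis for bb.%d\n", B.Id);
}

// Stand-in for the actual PHI lowering, which mutates the function.
static void lowerPhi(Instr &Phi) { Phi.IsVreg1Phi = false; }

void lowerPhis(std::vector<Block> &Blocks) {
  // Pass 1: collect every candidate PHI before lowering anything, so an
  // incoming value that is itself a Vreg_1 PHI in a not-yet-visited block
  // is still seen in its original, un-lowered form.
  std::vector<Instr *> Vreg1Phis;
  for (Block &B : Blocks)
    for (Instr &I : B.Phis)
      if (I.IsVreg1Phi)
        Vreg1Phis.push_back(&I);

  // Pass 2: lower in order, re-running the per-block analysis only when
  // the parent block changes (the PrevMBB check in the patch).
  Block *Prev = nullptr;
  for (Instr *I : Vreg1Phis) {
    if (I->Parent != Prev) {
      initializeAnalysis(*I->Parent);
      Prev = I->Parent;
    }
    lowerPhi(*I);
  }
}

int main() {
  std::vector<Block> Blocks(2);
  Blocks[0].Id = 0;
  Blocks[1].Id = 1;
  Blocks[1].Phis.push_back({&Blocks[1], true});
  Blocks[1].Phis.push_back({&Blocks[1], true});
  lowerPhis(Blocks); // analysis re-initialized once for bb.1, not twice
}
```

In the patch itself this role is played by the `Vreg1Phis` vector and the `PrevMBB` check inside `SILowerI1Copies::lowerPhis`, as shown in the diff below.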
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp | 179 |
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/i1_copy_phi_with_phi_incoming_value.mir | 140 |
2 files changed, 229 insertions, 90 deletions
```diff
diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
index b4541253635..0eb64972d61 100644
--- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -541,7 +541,7 @@ void SILowerI1Copies::lowerPhis() {
   MachineSSAUpdater SSAUpdater(*MF);
   LoopFinder LF(*DT, *PDT);
   PhiIncomingAnalysis PIA(*PDT);
-  SmallVector<MachineInstr *, 4> DeadPhis;
+  SmallVector<MachineInstr *, 4> Vreg1Phis;
   SmallVector<MachineBasicBlock *, 4> IncomingBlocks;
   SmallVector<unsigned, 4> IncomingRegs;
   SmallVector<unsigned, 4> IncomingUpdated;
@@ -550,118 +550,117 @@
 #endif
 
   for (MachineBasicBlock &MBB : *MF) {
-    LF.initialize(MBB);
-
     for (MachineInstr &MI : MBB.phis()) {
-      Register DstReg = MI.getOperand(0).getReg();
-      if (!isVreg1(DstReg))
-        continue;
+      if (isVreg1(MI.getOperand(0).getReg()))
+        Vreg1Phis.push_back(&MI);
+    }
+  }
 
-      LLVM_DEBUG(dbgs() << "Lower PHI: " << MI);
+  MachineBasicBlock *PrevMBB = nullptr;
+  for (MachineInstr *MI : Vreg1Phis) {
+    MachineBasicBlock &MBB = *MI->getParent();
+    if (&MBB != PrevMBB) {
+      LF.initialize(MBB);
+      PrevMBB = &MBB;
+    }
 
-      MRI->setRegClass(DstReg, IsWave32 ? &AMDGPU::SReg_32RegClass
-                                        : &AMDGPU::SReg_64RegClass);
+    LLVM_DEBUG(dbgs() << "Lower PHI: " << *MI);
 
-      // Collect incoming values.
-      for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
-        assert(i + 1 < MI.getNumOperands());
-        Register IncomingReg = MI.getOperand(i).getReg();
-        MachineBasicBlock *IncomingMBB = MI.getOperand(i + 1).getMBB();
-        MachineInstr *IncomingDef = MRI->getUniqueVRegDef(IncomingReg);
-
-        if (IncomingDef->getOpcode() == AMDGPU::COPY) {
-          IncomingReg = IncomingDef->getOperand(1).getReg();
-          assert(isLaneMaskReg(IncomingReg) || isVreg1(IncomingReg));
-          assert(!IncomingDef->getOperand(1).getSubReg());
-        } else if (IncomingDef->getOpcode() == AMDGPU::IMPLICIT_DEF) {
-          continue;
-        } else {
-          assert(IncomingDef->isPHI() || PhiRegisters.count(IncomingReg));
-        }
+    Register DstReg = MI->getOperand(0).getReg();
+    MRI->setRegClass(DstReg, IsWave32 ? &AMDGPU::SReg_32RegClass
+                                      : &AMDGPU::SReg_64RegClass);
+
+    // Collect incoming values.
+    for (unsigned i = 1; i < MI->getNumOperands(); i += 2) {
+      assert(i + 1 < MI->getNumOperands());
+      Register IncomingReg = MI->getOperand(i).getReg();
+      MachineBasicBlock *IncomingMBB = MI->getOperand(i + 1).getMBB();
+      MachineInstr *IncomingDef = MRI->getUniqueVRegDef(IncomingReg);
 
-        IncomingBlocks.push_back(IncomingMBB);
-        IncomingRegs.push_back(IncomingReg);
+      if (IncomingDef->getOpcode() == AMDGPU::COPY) {
+        IncomingReg = IncomingDef->getOperand(1).getReg();
+        assert(isLaneMaskReg(IncomingReg) || isVreg1(IncomingReg));
+        assert(!IncomingDef->getOperand(1).getSubReg());
+      } else if (IncomingDef->getOpcode() == AMDGPU::IMPLICIT_DEF) {
+        continue;
+      } else {
+        assert(IncomingDef->isPHI() || PhiRegisters.count(IncomingReg));
       }
 
+      IncomingBlocks.push_back(IncomingMBB);
+      IncomingRegs.push_back(IncomingReg);
+    }
+
 #ifndef NDEBUG
-      PhiRegisters.insert(DstReg);
+    PhiRegisters.insert(DstReg);
 #endif
 
-      // Phis in a loop that are observed outside the loop receive a simple but
-      // conservatively correct treatment.
-      std::vector<MachineBasicBlock *> DomBlocks = {&MBB};
-      for (MachineInstr &Use : MRI->use_instructions(DstReg))
-        DomBlocks.push_back(Use.getParent());
+    // Phis in a loop that are observed outside the loop receive a simple but
+    // conservatively correct treatment.
+    std::vector<MachineBasicBlock *> DomBlocks = {&MBB};
+    for (MachineInstr &Use : MRI->use_instructions(DstReg))
+      DomBlocks.push_back(Use.getParent());
 
-      MachineBasicBlock *PostDomBound =
-          PDT->findNearestCommonDominator(DomBlocks);
-      unsigned FoundLoopLevel = LF.findLoop(PostDomBound);
-
-      SSAUpdater.Initialize(DstReg);
-
-      if (FoundLoopLevel) {
-        LF.addLoopEntries(FoundLoopLevel, SSAUpdater, IncomingBlocks);
+    MachineBasicBlock *PostDomBound =
+        PDT->findNearestCommonDominator(DomBlocks);
+    unsigned FoundLoopLevel = LF.findLoop(PostDomBound);
 
-        for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
-          IncomingUpdated.push_back(createLaneMaskReg(*MF));
-          SSAUpdater.AddAvailableValue(IncomingBlocks[i],
-                                       IncomingUpdated.back());
-        }
+    SSAUpdater.Initialize(DstReg);
 
-        for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
-          MachineBasicBlock &IMBB = *IncomingBlocks[i];
-          buildMergeLaneMasks(
-              IMBB, getSaluInsertionAtEnd(IMBB), {}, IncomingUpdated[i],
-              SSAUpdater.GetValueInMiddleOfBlock(&IMBB), IncomingRegs[i]);
-        }
-      } else {
-        // The phi is not observed from outside a loop. Use a more accurate
-        // lowering.
-        PIA.analyze(MBB, IncomingBlocks);
-
-        for (MachineBasicBlock *MBB : PIA.predecessors())
-          SSAUpdater.AddAvailableValue(MBB, insertUndefLaneMask(*MBB));
-
-        for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
-          MachineBasicBlock &IMBB = *IncomingBlocks[i];
-          if (PIA.isSource(IMBB)) {
-            IncomingUpdated.push_back(0);
-            SSAUpdater.AddAvailableValue(&IMBB, IncomingRegs[i]);
-          } else {
-            IncomingUpdated.push_back(createLaneMaskReg(*MF));
-            SSAUpdater.AddAvailableValue(&IMBB, IncomingUpdated.back());
-          }
-        }
+    if (FoundLoopLevel) {
+      LF.addLoopEntries(FoundLoopLevel, SSAUpdater, IncomingBlocks);
 
-        for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
-          if (!IncomingUpdated[i])
-            continue;
+      for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
+        IncomingUpdated.push_back(createLaneMaskReg(*MF));
+        SSAUpdater.AddAvailableValue(IncomingBlocks[i],
+                                     IncomingUpdated.back());
+      }
 
-          MachineBasicBlock &IMBB = *IncomingBlocks[i];
-          buildMergeLaneMasks(
-              IMBB, getSaluInsertionAtEnd(IMBB), {}, IncomingUpdated[i],
-              SSAUpdater.GetValueInMiddleOfBlock(&IMBB), IncomingRegs[i]);
+      for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
+        MachineBasicBlock &IMBB = *IncomingBlocks[i];
+        buildMergeLaneMasks(
+            IMBB, getSaluInsertionAtEnd(IMBB), {}, IncomingUpdated[i],
+            SSAUpdater.GetValueInMiddleOfBlock(&IMBB), IncomingRegs[i]);
+      }
+    } else {
+      // The phi is not observed from outside a loop. Use a more accurate
+      // lowering.
+      PIA.analyze(MBB, IncomingBlocks);
+
+      for (MachineBasicBlock *MBB : PIA.predecessors())
+        SSAUpdater.AddAvailableValue(MBB, insertUndefLaneMask(*MBB));
+
+      for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
+        MachineBasicBlock &IMBB = *IncomingBlocks[i];
+        if (PIA.isSource(IMBB)) {
+          IncomingUpdated.push_back(0);
+          SSAUpdater.AddAvailableValue(&IMBB, IncomingRegs[i]);
+        } else {
+          IncomingUpdated.push_back(createLaneMaskReg(*MF));
+          SSAUpdater.AddAvailableValue(&IMBB, IncomingUpdated.back());
         }
       }
 
-      unsigned NewReg = SSAUpdater.GetValueInMiddleOfBlock(&MBB);
-      if (NewReg != DstReg) {
-        MRI->replaceRegWith(NewReg, DstReg);
+      for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
+        if (!IncomingUpdated[i])
+          continue;
 
-        // Ensure that DstReg has a single def and mark the old PHI node for
-        // deletion.
-        MI.getOperand(0).setReg(NewReg);
-        DeadPhis.push_back(&MI);
+        MachineBasicBlock &IMBB = *IncomingBlocks[i];
+        buildMergeLaneMasks(
+            IMBB, getSaluInsertionAtEnd(IMBB), {}, IncomingUpdated[i],
+            SSAUpdater.GetValueInMiddleOfBlock(&IMBB), IncomingRegs[i]);
       }
-
-      IncomingBlocks.clear();
-      IncomingRegs.clear();
-      IncomingUpdated.clear();
     }
 
-    for (MachineInstr *MI : DeadPhis)
+    unsigned NewReg = SSAUpdater.GetValueInMiddleOfBlock(&MBB);
+    if (NewReg != DstReg) {
+      MRI->replaceRegWith(NewReg, DstReg);
       MI->eraseFromParent();
-    DeadPhis.clear();
+    }
+
+    IncomingBlocks.clear();
+    IncomingRegs.clear();
+    IncomingUpdated.clear();
   }
 }
diff --git a/llvm/test/CodeGen/AMDGPU/i1_copy_phi_with_phi_incoming_value.mir b/llvm/test/CodeGen/AMDGPU/i1_copy_phi_with_phi_incoming_value.mir
new file mode 100644
index 00000000000..1d3afa80a4e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/i1_copy_phi_with_phi_incoming_value.mir
@@ -0,0 +1,140 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass=si-i1-copies %s -o - | FileCheck -check-prefix=GCN %s
+---
+
+name: kernel_i1_copy_phi_with_phi_incoming_value
+tracksRegLiveness: true
+body: |
+  ; GCN-LABEL: name: kernel_i1_copy_phi_with_phi_incoming_value
+  ; GCN: bb.0:
+  ; GCN:   successors: %bb.1(0x40000000), %bb.5(0x40000000)
+  ; GCN:   liveins: $vgpr0, $sgpr4_sgpr5
+  ; GCN:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+  ; GCN:   [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+  ; GCN:   [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 0, 0, 0 :: (dereferenceable invariant load 4, align 16, addrspace 4)
+  ; GCN:   [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORD_IMM]]
+  ; GCN:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY1]](s32)
+  ; GCN:   [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY1]](s32), [[S_LOAD_DWORD_IMM]], implicit $exec
+  ; GCN:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+  ; GCN:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_LT_I32_e64_]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GCN:   S_BRANCH %bb.1
+  ; GCN: bb.1:
+  ; GCN:   successors: %bb.6(0x80000000)
+  ; GCN:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16
+  ; GCN:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY3]], killed [[S_MOV_B32_]], 0, implicit $exec
+  ; GCN:   [[V_CMP_GE_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GE_I32_e64 [[V_ADD_U32_e64_]], [[COPY2]], implicit $exec
+  ; GCN:   [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+  ; GCN:   [[COPY4:%[0-9]+]]:sreg_64 = COPY [[V_CMP_GE_I32_e64_]]
+  ; GCN:   S_BRANCH %bb.6
+  ; GCN: bb.2:
+  ; GCN:   successors: %bb.5(0x80000000)
+  ; GCN:   [[PHI:%[0-9]+]]:sreg_64 = PHI %15, %bb.6
+  ; GCN:   SI_END_CF [[PHI]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GCN:   [[S_MOV_B64_2:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
+  ; GCN:   [[COPY5:%[0-9]+]]:sreg_64 = COPY $exec
+  ; GCN:   S_BRANCH %bb.5
+  ; GCN: bb.3:
+  ; GCN:   successors: %bb.4(0x40000000), %bb.7(0x40000000)
+  ; GCN:   ATOMIC_FENCE 5, 2
+  ; GCN:   S_BARRIER
+  ; GCN:   ATOMIC_FENCE 4, 2
+  ; GCN:   [[COPY6:%[0-9]+]]:sreg_64 = COPY %18
+  ; GCN:   [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF [[COPY6]], %bb.7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GCN:   S_BRANCH %bb.4
+  ; GCN: bb.4:
+  ; GCN:   successors: %bb.7(0x80000000)
+  ; GCN:   S_BRANCH %bb.7
+  ; GCN: bb.5:
+  ; GCN:   successors: %bb.3(0x80000000)
+  ; GCN:   [[PHI1:%[0-9]+]]:sreg_64 = PHI [[S_MOV_B64_]], %bb.0, [[COPY5]], %bb.2
+  ; GCN:   SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GCN:   S_BRANCH %bb.3
+  ; GCN: bb.6:
+  ; GCN:   successors: %bb.2(0x40000000), %bb.6(0x40000000)
+  ; GCN:   [[PHI2:%[0-9]+]]:sreg_64 = PHI [[S_MOV_B64_1]], %bb.1, %15, %bb.6
+  ; GCN:   [[COPY7:%[0-9]+]]:sreg_64 = COPY [[COPY4]]
+  ; GCN:   [[SI_IF_BREAK:%[0-9]+]]:sreg_64 = SI_IF_BREAK [[COPY7]], [[PHI2]], implicit-def dead $scc
+  ; GCN:   SI_LOOP [[SI_IF_BREAK]], %bb.6, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GCN:   S_BRANCH %bb.2
+  ; GCN: bb.7:
+  ; GCN:   SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GCN:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1, %bb.5
+    liveins: $vgpr0, $sgpr4_sgpr5
+
+    %1:sgpr_64(p4) = COPY $sgpr4_sgpr5
+    %2:vgpr_32(s32) = COPY $vgpr0
+    %3:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1:sgpr_64(p4), 0, 0, 0 :: (dereferenceable invariant load 4, align 16, addrspace 4)
+    %8:sreg_32 = COPY %3:sreg_32_xm0_xexec
+    %14:vgpr_32 = COPY %2:vgpr_32(s32)
+    %9:sreg_64 = V_CMP_LT_I32_e64 %2:vgpr_32(s32), %3:sreg_32_xm0_xexec, implicit $exec
+    %4:sreg_64 = S_MOV_B64 0
+    %17:vreg_1 = COPY %4:sreg_64, implicit $exec
+    %16:sreg_64 = SI_IF killed %9:sreg_64, %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    ; predecessors: %bb.0
+    successors: %bb.6
+
+    %10:sreg_32 = S_MOV_B32 16
+    %18:vgpr_32 = V_ADD_U32_e64 %14:vgpr_32, killed %10:sreg_32, 0, implicit $exec
+    %11:sreg_64 = V_CMP_GE_I32_e64 %18:vgpr_32, %8:sreg_32, implicit $exec
+    %12:sreg_64 = S_MOV_B64 0
+    %19:vreg_1 = COPY %11:sreg_64
+    S_BRANCH %bb.6
+
+  bb.2:
+    ; predecessors: %bb.6
+    successors: %bb.5
+
+    %20:sreg_64 = PHI %6:sreg_64, %bb.6
+    SI_END_CF %20:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %15:sreg_64 = S_MOV_B64 -1
+    %21:vreg_1 = COPY %15:sreg_64, implicit $exec
+    S_BRANCH %bb.5
+
+  bb.3:
+    ; predecessors: %bb.5
+    successors: %bb.4, %bb.7
+
+    %22:vreg_1 = PHI %7:vreg_1, %bb.5
+    ATOMIC_FENCE 5, 2
+    S_BARRIER
+    ATOMIC_FENCE 4, 2
+    %23:sreg_64 = COPY %22:vreg_1
+    %24:sreg_64 = SI_IF %23:sreg_64, %bb.7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.4:
+    ; predecessors: %bb.3
+    successors: %bb.7
+
+    S_BRANCH %bb.7
+
+  bb.5:
+    ; predecessors: %bb.0, %bb.2
+    successors: %bb.3
+
+    %7:vreg_1 = PHI %17:vreg_1, %bb.0, %21:vreg_1, %bb.2
+    SI_END_CF %16:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.3
+
+  bb.6:
+    ; predecessors: %bb.1, %bb.6
+    successors: %bb.2, %bb.6
+
+    %5:sreg_64 = PHI %12:sreg_64, %bb.1, %6:sreg_64, %bb.6
+    %13:sreg_64 = COPY %19:vreg_1
+    %6:sreg_64 = SI_IF_BREAK %13:sreg_64, %5:sreg_64, implicit-def dead $scc
+    SI_LOOP %6:sreg_64, %bb.6, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_BRANCH %bb.2
+
+  bb.7:
+    ; predecessors: %bb.3, %bb.4
+
+    SI_END_CF %24:sreg_64, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    S_ENDPGM 0
+
+...
```