author     Marek Olsak <marek.olsak@amd.com>   2017-10-24 10:27:13 +0000
committer  Marek Olsak <marek.olsak@amd.com>   2017-10-24 10:27:13 +0000
commit     ce76ea03942d5190da13376c8c9f32d6cc671ed8
tree       d7e8d096604a8aeec814b3f9700b8900e4951fba
parent     2114fc3bcba7f84f1a1d9e7704b5eac9657814a4
AMDGPU: Add new intrinsic llvm.amdgcn.kill(i1)
Summary:
Kill the thread if operand 0 is false.
llvm.amdgcn.wqm.vote can be applied to the operand.
Also allow kill in all shader stages.
Reviewers: arsenm, nhaehnle
Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits
Differential Revision: https://reviews.llvm.org/D38544
llvm-svn: 316427
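
For reference, a minimal IR sketch of the intended usage (function names here are hypothetical; the kill-when-false semantics and the wqm.vote combination follow the summary above and the tests added below):

    ; Keep the lane only when %x >= 0; otherwise clear it from EXEC.
    define amdgpu_ps void @discard_example(float %x) {
      %keep = fcmp oge float %x, 0.0
      call void @llvm.amdgcn.kill(i1 %keep)
      ret void
    }

    ; llvm.amdgcn.wqm.vote can be applied to the operand first, so the
    ; kill condition is evaluated in whole quad mode (see the wqm test).
    define amdgpu_ps void @discard_wqm_example(float %x) {
      %c = fcmp une float %x, 0.0
      %vote = call i1 @llvm.amdgcn.wqm.vote(i1 %c)
      call void @llvm.amdgcn.kill(i1 %vote)
      ret void
    }

    declare void @llvm.amdgcn.kill(i1)
    declare i1 @llvm.amdgcn.wqm.vote(i1)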
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td                     |   3
 llvm/lib/Target/AMDGPU/AMDGPUInstructions.td                 |   1
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp                    |   7
 llvm/lib/Target/AMDGPU/SIInsertSkips.cpp                     | 113
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp                       |  21
 llvm/lib/Target/AMDGPU/SIInstrInfo.h                         |   3
 llvm/lib/Target/AMDGPU/SIInstrInfo.td                        |   4
 llvm/lib/Target/AMDGPU/SIInstructions.td                     |  50
 llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp                |   9
 llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp         |   8
 llvm/test/CodeGen/AMDGPU/insert-skips-kill-uncond.mir        |   2
 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll                 | 241
 llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll |  15
 13 files changed, 440 insertions, 37 deletions
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 4df10e87d18..f507f9c1668 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -753,6 +753,9 @@ def int_amdgcn_wqm_vote : Intrinsic<[llvm_i1_ty],
   [llvm_i1_ty], [IntrNoMem, IntrConvergent]
 >;
 
+// If false, set EXEC=0 for the current thread until the end of program.
+def int_amdgcn_kill : Intrinsic<[], [llvm_i1_ty], []>;
+
 // Copies the active channels of the source value to the destination value,
 // with the guarantee that the source value is computed as if the entire
 // program were executed in Whole Wavefront Mode, i.e. with all channels
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index b73dd41fc51..6498aafc6ac 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -167,7 +167,6 @@ def COND_OLE : PatLeaf <
   [{return N->get() == ISD::SETOLE || N->get() == ISD::SETLE;}]
 >;
 
-
 def COND_O : PatLeaf <(cond), [{return N->get() == ISD::SETO;}]>;
 def COND_UO : PatLeaf <(cond), [{return N->get() == ISD::SETUO;}]>;
 
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 64a4d06e95c..d729dcc439e 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2449,7 +2449,7 @@ MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI,
 
   if (SplitPoint == BB->end()) {
     // Don't bother with a new block.
-    MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR));
+    MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
     return BB;
   }
 
@@ -2463,7 +2463,7 @@ MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI,
   SplitBB->transferSuccessorsAndUpdatePHIs(BB);
   BB->addSuccessor(SplitBB);
 
-  MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR));
+  MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
   return SplitBB;
 }
 
@@ -3017,7 +3017,8 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
   case AMDGPU::SI_INDIRECT_DST_V8:
   case AMDGPU::SI_INDIRECT_DST_V16:
     return emitIndirectDst(MI, *BB, *getSubtarget());
-  case AMDGPU::SI_KILL:
+  case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
+  case AMDGPU::SI_KILL_I1_PSEUDO:
     return splitKillBlock(MI, BB);
   case AMDGPU::V_CNDMASK_B64_PSEUDO: {
     MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
diff --git a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
index 9bd58c45ce0..1b8c9f27712 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
@@ -200,25 +200,101 @@ bool SIInsertSkips::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) {
 void SIInsertSkips::kill(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
   DebugLoc DL = MI.getDebugLoc();
-  const MachineOperand &Op = MI.getOperand(0);
-
-#ifndef NDEBUG
-  CallingConv::ID CallConv = MBB.getParent()->getFunction()->getCallingConv();
-  // Kill is only allowed in pixel / geometry shaders.
-  assert(CallConv == CallingConv::AMDGPU_PS ||
-         CallConv == CallingConv::AMDGPU_GS);
-#endif
-  // Clear this thread from the exec mask if the operand is negative.
-  if (Op.isImm()) {
-    // Constant operand: Set exec mask to 0 or do nothing
-    if (Op.getImm() & 0x80000000) {
-      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
-        .addImm(0);
+
+  switch (MI.getOpcode()) {
+  case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: {
+    unsigned Opcode = 0;
+
+    // The opcodes are inverted because the inline immediate has to be
+    // the first operand, e.g. from "x < imm" to "imm > x"
+    switch (MI.getOperand(2).getImm()) {
+    case ISD::SETOEQ:
+    case ISD::SETEQ:
+      Opcode = AMDGPU::V_CMPX_EQ_F32_e32;
+      break;
+    case ISD::SETOGT:
+    case ISD::SETGT:
+      Opcode = AMDGPU::V_CMPX_LT_F32_e32;
+      break;
+    case ISD::SETOGE:
+    case ISD::SETGE:
+      Opcode = AMDGPU::V_CMPX_LE_F32_e32;
+      break;
+    case ISD::SETOLT:
+    case ISD::SETLT:
+      Opcode = AMDGPU::V_CMPX_GT_F32_e32;
+      break;
+    case ISD::SETOLE:
+    case ISD::SETLE:
+      Opcode = AMDGPU::V_CMPX_GE_F32_e32;
+      break;
+    case ISD::SETONE:
+    case ISD::SETNE:
+      Opcode = AMDGPU::V_CMPX_LG_F32_e32;
+      break;
+    case ISD::SETO:
+      Opcode = AMDGPU::V_CMPX_O_F32_e32;
+      break;
+    case ISD::SETUO:
+      Opcode = AMDGPU::V_CMPX_U_F32_e32;
+      break;
+    case ISD::SETUEQ:
+      Opcode = AMDGPU::V_CMPX_NLG_F32_e32;
+      break;
+    case ISD::SETUGT:
+      Opcode = AMDGPU::V_CMPX_NGE_F32_e32;
+      break;
+    case ISD::SETUGE:
+      Opcode = AMDGPU::V_CMPX_NGT_F32_e32;
+      break;
+    case ISD::SETULT:
+      Opcode = AMDGPU::V_CMPX_NLE_F32_e32;
+      break;
+    case ISD::SETULE:
+      Opcode = AMDGPU::V_CMPX_NLT_F32_e32;
+      break;
+    case ISD::SETUNE:
+      Opcode = AMDGPU::V_CMPX_NEQ_F32_e32;
+      break;
+    default:
+      llvm_unreachable("invalid ISD:SET cond code");
+    }
+
+    // TODO: Allow this:
+    if (!MI.getOperand(0).isReg() ||
+        !TRI->isVGPR(MBB.getParent()->getRegInfo(),
+                     MI.getOperand(0).getReg()))
+      llvm_unreachable("SI_KILL operand should be a VGPR");
+
+    BuildMI(MBB, &MI, DL, TII->get(Opcode))
+        .add(MI.getOperand(1))
+        .add(MI.getOperand(0));
+    break;
+  }
+  case AMDGPU::SI_KILL_I1_TERMINATOR: {
+    const MachineOperand &Op = MI.getOperand(0);
+    int64_t KillVal = MI.getOperand(1).getImm();
+    assert(KillVal == 0 || KillVal == -1);
+
+    // Kill all threads if Op0 is an immediate and equal to the Kill value.
+    if (Op.isImm()) {
+      int64_t Imm = Op.getImm();
+      assert(Imm == 0 || Imm == -1);
+
+      if (Imm == KillVal)
+        BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
+          .addImm(0);
+      break;
     }
-  } else {
-    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32))
-      .addImm(0)
+
+    unsigned Opcode = KillVal ? AMDGPU::S_ANDN2_B64 : AMDGPU::S_AND_B64;
+    BuildMI(MBB, &MI, DL, TII->get(Opcode), AMDGPU::EXEC)
+        .addReg(AMDGPU::EXEC)
       .add(Op);
+    break;
+  }
+  default:
+    llvm_unreachable("invalid opcode, expected SI_KILL_*_TERMINATOR");
   }
 }
 
@@ -311,7 +387,8 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
       }
       break;
 
-      case AMDGPU::SI_KILL_TERMINATOR:
+      case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
+      case AMDGPU::SI_KILL_I1_TERMINATOR:
         MadeChange = true;
         kill(MI);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 807e60632ae..06de0658a7d 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4591,3 +4591,24 @@ SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
   return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg)
            .addReg(UnusedCarry, RegState::Define | RegState::Dead);
 }
+
+bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
+  switch (Opcode) {
+  case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
+  case AMDGPU::SI_KILL_I1_TERMINATOR:
+    return true;
+  default:
+    return false;
+  }
+}
+
+const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) const {
+  switch (Opcode) {
+  case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
+    return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
+  case AMDGPU::SI_KILL_I1_PSEUDO:
+    return get(AMDGPU::SI_KILL_I1_TERMINATOR);
+  default:
+    llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
+  }
+}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 59725aeb1dc..f8de0efc5dd 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -857,6 +857,9 @@ public:
                                     MachineBasicBlock::iterator I,
                                     const DebugLoc &DL,
                                     unsigned DestReg) const;
+
+  static bool isKillTerminator(unsigned Opcode);
+  const MCInstrDesc &getKillTerminatorFromPseudo(unsigned Opcode) const;
 };
 
 namespace AMDGPU {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 4f9d1dbd05c..1273f451e18 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -297,6 +297,10 @@ def as_i64imm: SDNodeXForm<imm, [{
   return CurDAG->getTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i64);
 }]>;
 
+def cond_as_i32imm: SDNodeXForm<cond, [{
+  return CurDAG->getTargetConstant(N->get(), SDLoc(N), MVT::i32);
+}]>;
+
 // Copied from the AArch64 backend:
 def bitcast_fpimm_to_i32 : SDNodeXForm<fpimm, [{
   return CurDAG->getTargetConstant(
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 6ae50b7866c..6cee5be9da9 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -275,18 +275,21 @@ def SI_ELSE_BREAK : CFPseudoInstSI <
 }
 
 let Uses = [EXEC], Defs = [EXEC,VCC] in {
-def SI_KILL : PseudoInstSI <
-  (outs), (ins VSrc_b32:$src),
-  [(AMDGPUkill i32:$src)]> {
-  let isConvergent = 1;
-  let usesCustomInserter = 1;
-}
 
-def SI_KILL_TERMINATOR : SPseudoInstSI <
-  (outs), (ins VSrc_b32:$src)> {
-  let isTerminator = 1;
+multiclass PseudoInstKill <dag ins> {
+  def _PSEUDO : PseudoInstSI <(outs), ins> {
+    let isConvergent = 1;
+    let usesCustomInserter = 1;
+  }
+
+  def _TERMINATOR : SPseudoInstSI <(outs), ins> {
+    let isTerminator = 1;
+  }
 }
 
+defm SI_KILL_I1 : PseudoInstKill <(ins SSrc_b64:$src, i1imm:$killvalue)>;
+defm SI_KILL_F32_COND_IMM : PseudoInstKill <(ins VSrc_b32:$src0, i32imm:$src1, i32imm:$cond)>;
+
 def SI_ILLEGAL_COPY : SPseudoInstSI <
   (outs unknown:$dst), (ins unknown:$src),
   [], " ; illegal copy $src to $dst">;
 
@@ -546,8 +549,35 @@ def : GCNPat<
 
 def : GCNPat <
   (int_AMDGPU_kilp),
-  (SI_KILL (i32 0xbf800000))
+  (SI_KILL_I1_PSEUDO (i1 0), 0)
+>;
+
+def : Pat <
+  // -1.0 as i32 (LowerINTRINSIC_VOID converts all other constants to -1.0)
+  (AMDGPUkill (i32 -1082130432)),
+  (SI_KILL_I1_PSEUDO (i1 0), 0)
+>;
+
+def : Pat <
+  (int_amdgcn_kill i1:$src),
+  (SI_KILL_I1_PSEUDO $src, 0)
+>;
+
+def : Pat <
+  (int_amdgcn_kill (i1 (not i1:$src))),
+  (SI_KILL_I1_PSEUDO $src, -1)
+>;
+
+def : Pat <
+  (AMDGPUkill i32:$src),
+  (SI_KILL_F32_COND_IMM_PSEUDO $src, 0, 3) // 3 means SETOGE
+>;
+
+def : Pat <
+  (int_amdgcn_kill (i1 (setcc f32:$src, InlineFPImm<f32>:$imm, cond:$cond))),
+  (SI_KILL_F32_COND_IMM_PSEUDO $src, (bitcast_fpimm_to_i32 $imm), (cond_as_i32imm $cond))
 >;
 
+// TODO: we could add more variants for other types of conditionals
+
 //===----------------------------------------------------------------------===//
 // VOP1 Patterns
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index 8e19e159971..29fc5ef50db 100644
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -134,7 +134,8 @@ static void setImpSCCDefDead(MachineInstr &MI, bool IsDead) {
 
 char &llvm::SILowerControlFlowID = SILowerControlFlow::ID;
 
-static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI) {
+static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI,
+                       const SIInstrInfo *TII) {
   unsigned SaveExecReg = MI.getOperand(0).getReg();
   auto U = MRI->use_instr_nodbg_begin(SaveExecReg);
 
@@ -143,7 +144,7 @@ static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI)
       U->getOpcode() != AMDGPU::SI_END_CF)
     return false;
 
-  // Check for SI_KILL_TERMINATOR on path from if to endif.
+  // Check for SI_KILL_*_TERMINATOR on path from if to endif.
   // if there is any such terminator simplififcations are not safe.
   auto SMBB = MI.getParent();
   auto EMBB = U->getParent();
@@ -157,7 +158,7 @@ static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI)
     if (MBB == EMBB || !Visited.insert(MBB).second)
       continue;
     for(auto &Term : MBB->terminators())
-      if (Term.getOpcode() == AMDGPU::SI_KILL_TERMINATOR)
+      if (TII->isKillTerminator(Term.getOpcode()))
        return false;
 
     Worklist.append(MBB->succ_begin(), MBB->succ_end());
@@ -184,7 +185,7 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
   // If there is only one use of save exec register and that use is SI_END_CF,
   // we can optimize SI_IF by returning the full saved exec mask instead of
   // just cleared bits.
-  bool SimpleIf = isSimpleIf(MI, MRI);
+  bool SimpleIf = isSimpleIf(MI, MRI, TII);
 
   // Add an implicit def of exec to discourage scheduling VALU after this which
   // will interfere with trying to form s_and_saveexec_b64 later.
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index b2282cfd419..3a5debc3223 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -3539,6 +3539,14 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
     return replaceInstUsesWith(*II, II->getArgOperand(0));
   }
+  case Intrinsic::amdgcn_kill: {
+    const ConstantInt *C = dyn_cast<ConstantInt>(II->getArgOperand(0));
+    if (!C || !C->getZExtValue())
+      break;
+
+    // amdgcn.kill(i1 1) is a no-op
+    return eraseInstFromFunction(CI);
+  }
   case Intrinsic::stackrestore: {
     // If the save is right next to the restore, remove the restore. This can
     // happen when variable allocas are DCE'd.
diff --git a/llvm/test/CodeGen/AMDGPU/insert-skips-kill-uncond.mir b/llvm/test/CodeGen/AMDGPU/insert-skips-kill-uncond.mir
index bd5f296affb..e3a559998be 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-skips-kill-uncond.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-skips-kill-uncond.mir
@@ -33,7 +33,7 @@ body: |
   bb.1:
     successors: %bb.2
     %vgpr0 = V_MOV_B32_e32 0, implicit %exec
-    SI_KILL_TERMINATOR %vgpr0, implicit-def %exec, implicit-def %vcc, implicit %exec
+    SI_KILL_F32_COND_IMM_TERMINATOR %vgpr0, 0, 3, implicit-def %exec, implicit-def %vcc, implicit %exec
     S_BRANCH %bb.2
 
   bb.2:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll
new file mode 100644
index 00000000000..a1ecb7f750c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll
@@ -0,0 +1,241 @@
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI %s
+
+; SI-LABEL: {{^}}gs_const:
+; SI-NOT: v_cmpx
+; SI: s_mov_b64 exec, 0
+define amdgpu_gs void @gs_const() {
+  %tmp = icmp ule i32 0, 3
+  %tmp1 = select i1 %tmp, float 1.000000e+00, float -1.000000e+00
+  %c1 = fcmp oge float %tmp1, 0.0
+  call void @llvm.amdgcn.kill(i1 %c1)
+  %tmp2 = icmp ule i32 3, 0
+  %tmp3 = select i1 %tmp2, float 1.000000e+00, float -1.000000e+00
+  %c2 = fcmp oge float %tmp3, 0.0
+  call void @llvm.amdgcn.kill(i1 %c2)
+  ret void
+}
+
+; SI-LABEL: {{^}}vcc_implicit_def:
+; SI-NOT: v_cmp_gt_f32_e32 vcc,
+; SI: v_cmp_gt_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], 0, v{{[0-9]+}}
+; SI: v_cmpx_le_f32_e32 vcc, 0, v{{[0-9]+}}
+; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1.0, [[CMP]]
+define amdgpu_ps void @vcc_implicit_def(float %arg13, float %arg14) {
+  %tmp0 = fcmp olt float %arg13, 0.000000e+00
+  %c1 = fcmp oge float %arg14, 0.0
+  call void @llvm.amdgcn.kill(i1 %c1)
+  %tmp1 = select i1 %tmp0, float 1.000000e+00, float 0.000000e+00
+  call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0
+  ret void
+}
+
+; SI-LABEL: {{^}}true:
+; SI-NEXT: BB#
+; SI-NEXT: BB#
+; SI-NEXT: s_endpgm
+define amdgpu_gs void @true() {
+  call void @llvm.amdgcn.kill(i1 true)
+  ret void
+}
+
+; SI-LABEL: {{^}}false:
+; SI-NOT: v_cmpx
+; SI: s_mov_b64 exec, 0
+define amdgpu_gs void @false() {
+  call void @llvm.amdgcn.kill(i1 false)
+  ret void
+}
+
+; SI-LABEL: {{^}}and:
+; SI: v_cmp_lt_i32
+; SI: v_cmp_lt_i32
+; SI: s_or_b64 s[0:1]
+; SI: s_and_b64 exec, exec, s[0:1]
+define amdgpu_gs void @and(i32 %a, i32 %b, i32 %c, i32 %d) {
+  %c1 = icmp slt i32 %a, %b
+  %c2 = icmp slt i32 %c, %d
+  %x = or i1 %c1, %c2
+  call void @llvm.amdgcn.kill(i1 %x)
+  ret void
+}
+
+; SI-LABEL: {{^}}andn2:
+; SI: v_cmp_lt_i32
+; SI: v_cmp_lt_i32
+; SI: s_xor_b64 s[0:1]
+; SI: s_andn2_b64 exec, exec, s[0:1]
+define amdgpu_gs void @andn2(i32 %a, i32 %b, i32 %c, i32 %d) {
+  %c1 = icmp slt i32 %a, %b
+  %c2 = icmp slt i32 %c, %d
+  %x = xor i1 %c1, %c2
+  %y = xor i1 %x, 1
+  call void @llvm.amdgcn.kill(i1 %y)
+  ret void
+}
+
+; SI-LABEL: {{^}}oeq:
+; SI: v_cmpx_eq_f32
+; SI-NOT: s_and
+define amdgpu_gs void @oeq(float %a) {
+  %c1 = fcmp oeq float %a, 0.0
+  call void @llvm.amdgcn.kill(i1 %c1)
+  ret void
+}
+
+; SI-LABEL: {{^}}ogt:
+; SI: v_cmpx_lt_f32
+; SI-NOT: s_and
+define amdgpu_gs void @ogt(float %a) {
+  %c1 = fcmp ogt float %a, 0.0
+  call void @llvm.amdgcn.kill(i1 %c1)
+  ret void
+}
+
+; SI-LABEL: {{^}}oge:
+; SI: v_cmpx_le_f32
+; SI-NOT: s_and
+define amdgpu_gs void @oge(float %a) {
+  %c1 = fcmp oge float %a, 0.0
+  call void @llvm.amdgcn.kill(i1 %c1)
+  ret void
+}
+
+; SI-LABEL: {{^}}olt:
+; SI: v_cmpx_gt_f32
+; SI-NOT: s_and
+define amdgpu_gs void @olt(float %a) {
+  %c1 = fcmp olt float %a, 0.0
+  call void @llvm.amdgcn.kill(i1 %c1)
+  ret void
+}
+
+; SI-LABEL: {{^}}ole:
+; SI: v_cmpx_ge_f32
+; SI-NOT: s_and
+define amdgpu_gs void @ole(float %a) {
+  %c1 = fcmp ole float %a, 0.0
+  call void @llvm.amdgcn.kill(i1 %c1)
+  ret void
+}
+
+; SI-LABEL: {{^}}one:
+; SI: v_cmpx_lg_f32
+; SI-NOT: s_and
+define amdgpu_gs void @one(float %a) {
+  %c1 = fcmp one float %a, 0.0
+  call void @llvm.amdgcn.kill(i1 %c1)
+  ret void
+}
+
+; SI-LABEL: {{^}}ord:
+; FIXME: This is absolutely unimportant, but we could use the cmpx variant here.
+; SI: v_cmp_o_f32
+define amdgpu_gs void @ord(float %a) {
+  %c1 = fcmp ord float %a, 0.0
+  call void @llvm.amdgcn.kill(i1 %c1)
+  ret void
+}
+
+; SI-LABEL: {{^}}uno:
+; FIXME: This is absolutely unimportant, but we could use the cmpx variant here.
+; SI: v_cmp_u_f32
+define amdgpu_gs void @uno(float %a) {
+  %c1 = fcmp uno float %a, 0.0
+  call void @llvm.amdgcn.kill(i1 %c1)
+  ret void
+}
+
+; SI-LABEL: {{^}}ueq:
+; SI: v_cmpx_nlg_f32
+; SI-NOT: s_and
+define amdgpu_gs void @ueq(float %a) {
+  %c1 = fcmp ueq float %a, 0.0
+  call void @llvm.amdgcn.kill(i1 %c1)
+  ret void
+}
+
+; SI-LABEL: {{^}}ugt:
+; SI: v_cmpx_nge_f32
+; SI-NOT: s_and
+define amdgpu_gs void @ugt(float %a) {
+  %c1 = fcmp ugt float %a, 0.0
+  call void @llvm.amdgcn.kill(i1 %c1)
+  ret void
+}
+
+; SI-LABEL: {{^}}uge:
+; SI: v_cmpx_ngt_f32_e32 vcc, -1.0
+; SI-NOT: s_and
+define amdgpu_gs void @uge(float %a) {
+  %c1 = fcmp uge float %a, -1.0
+  call void @llvm.amdgcn.kill(i1 %c1)
+  ret void
+}
+
+; SI-LABEL: {{^}}ult:
+; SI: v_cmpx_nle_f32_e32 vcc, -2.0
+; SI-NOT: s_and
+define amdgpu_gs void @ult(float %a) {
+  %c1 = fcmp ult float %a, -2.0
+  call void @llvm.amdgcn.kill(i1 %c1)
+  ret void
+}
+
+; SI-LABEL: {{^}}ule:
+; SI: v_cmpx_nlt_f32_e32 vcc, 2.0
+; SI-NOT: s_and
+define amdgpu_gs void @ule(float %a) {
+  %c1 = fcmp ule float %a, 2.0
+  call void @llvm.amdgcn.kill(i1 %c1)
+  ret void
+}
+
+; SI-LABEL: {{^}}une:
+; SI: v_cmpx_neq_f32_e32 vcc, 0
+; SI-NOT: s_and
+define amdgpu_gs void @une(float %a) {
+  %c1 = fcmp une float %a, 0.0
+  call void @llvm.amdgcn.kill(i1 %c1)
+  ret void
+}
+
+; SI-LABEL: {{^}}neg_olt:
+; SI: v_cmpx_ngt_f32_e32 vcc, 1.0
+; SI-NOT: s_and
+define amdgpu_gs void @neg_olt(float %a) {
+  %c1 = fcmp olt float %a, 1.0
+  %c2 = xor i1 %c1, 1
+  call void @llvm.amdgcn.kill(i1 %c2)
+  ret void
+}
+
+; SI-LABEL: {{^}}fcmp_x2:
+; FIXME: LLVM should be able to combine these fcmp opcodes.
+; SI: v_cmp_gt_f32
+; SI: v_cndmask_b32
+; SI: v_cmpx_le_f32
+define amdgpu_ps void @fcmp_x2(float %a) #0 {
+  %ogt = fcmp nsz ogt float %a, 2.500000e-01
+  %k = select i1 %ogt, float -1.000000e+00, float 0.000000e+00
+  %c = fcmp nsz oge float %k, 0.000000e+00
+  call void @llvm.amdgcn.kill(i1 %c) #1
+  ret void
+}
+
+; SI-LABEL: {{^}}wqm:
+; SI: v_cmp_neq_f32_e32 vcc, 0
+; SI: s_wqm_b64 s[0:1], vcc
+; SI: s_and_b64 exec, exec, s[0:1]
+define amdgpu_ps void @wqm(float %a) {
+  %c1 = fcmp une float %a, 0.0
+  %c2 = call i1 @llvm.amdgcn.wqm.vote(i1 %c1)
+  call void @llvm.amdgcn.kill(i1 %c2)
+  ret void
+}
+
+declare void @llvm.amdgcn.kill(i1) #0
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
+declare i1 @llvm.amdgcn.wqm.vote(i1)
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
index 921f9581e39..f82bf81fbbf 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
@@ -1570,4 +1570,19 @@ main_body:
   ret float %r
 }
 
+; --------------------------------------------------------------------
+; llvm.amdgcn.kill
+; --------------------------------------------------------------------
+
+declare void @llvm.amdgcn.kill(i1)
+
+; CHECK-LABEL: @kill_true() {
+; CHECK-NEXT: ret void
+; CHECK-NEXT: }
+define void @kill_true() {
+  call void @llvm.amdgcn.kill(i1 true)
+  ret void
+}
+
 ; CHECK: attributes #5 = { convergent }