Diffstat (limited to 'llvm/lib/Target')
-rw-r--r--   llvm/lib/Target/AMDGPU/AMDGPUInstructions.td |   8
-rw-r--r--   llvm/lib/Target/AMDGPU/SIInstrInfo.cpp       | 187
-rw-r--r--   llvm/lib/Target/AMDGPU/SIInstrInfo.h         |   8
-rw-r--r--   llvm/lib/Target/AMDGPU/SOPInstructions.td    |  45
4 files changed, 214 insertions, 34 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index 36e81ac78a1..282d1c11833 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -135,6 +135,12 @@ def brtarget : Operand<OtherVT>;
 // Misc. PatFrags
 //===----------------------------------------------------------------------===//
 
+class HasOneUseUnaryOp<SDPatternOperator op> : PatFrag<
+  (ops node:$src0),
+  (op $src0),
+  [{ return N->hasOneUse(); }]
+>;
+
 class HasOneUseBinOp<SDPatternOperator op> : PatFrag<
   (ops node:$src0, node:$src1),
   (op $src0, $src1),
@@ -165,6 +171,8 @@ def or_oneuse : HasOneUseBinOp<or>;
 def xor_oneuse : HasOneUseBinOp<xor>;
 } // Properties = [SDNPCommutative, SDNPAssociative]
 
+def not_oneuse : HasOneUseUnaryOp<not>;
+
 def add_oneuse : HasOneUseBinOp<add>;
 def sub_oneuse : HasOneUseBinOp<sub>;
 
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 978677ba7b0..5d2ca05ec5b 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3199,6 +3199,8 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
   case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
   case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
   case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
+  case AMDGPU::S_XNOR_B32:
+    return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
   case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
   case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
   case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
@@ -4166,22 +4168,47 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
       // Default handling
       break;
     case AMDGPU::S_AND_B64:
-      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64, MDT);
+      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
       Inst.eraseFromParent();
       continue;
 
     case AMDGPU::S_OR_B64:
-      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_OR_B32_e64, MDT);
+      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
       Inst.eraseFromParent();
       continue;
 
     case AMDGPU::S_XOR_B64:
-      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64, MDT);
+      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
+      Inst.eraseFromParent();
+      continue;
+
+    case AMDGPU::S_NAND_B64:
+      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
+      Inst.eraseFromParent();
+      continue;
+
+    case AMDGPU::S_NOR_B64:
+      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
+      Inst.eraseFromParent();
+      continue;
+
+    case AMDGPU::S_XNOR_B64:
+      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
+      Inst.eraseFromParent();
+      continue;
+
+    case AMDGPU::S_ANDN2_B64:
+      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
+      Inst.eraseFromParent();
+      continue;
+
+    case AMDGPU::S_ORN2_B64:
+      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
       Inst.eraseFromParent();
       continue;
 
     case AMDGPU::S_NOT_B64:
-      splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::V_NOT_B32_e32);
+      splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
       Inst.eraseFromParent();
       continue;
 
@@ -4262,8 +4289,23 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
       Inst.eraseFromParent();
       continue;
 
-    case AMDGPU::S_XNOR_B64:
-      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
+    case AMDGPU::S_NAND_B32:
+      splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
+      Inst.eraseFromParent();
+      continue;
+
+    case AMDGPU::S_NOR_B32:
+      splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
+      Inst.eraseFromParent();
+      continue;
+
+    case AMDGPU::S_ANDN2_B32:
+      splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
+      Inst.eraseFromParent();
+      continue;
+
+    case AMDGPU::S_ORN2_B32:
+      splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
       Inst.eraseFromParent();
       continue;
 
@@ -4549,23 +4591,116 @@ void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist,
   MachineOperand &Src0 = Inst.getOperand(1);
   MachineOperand &Src1 = Inst.getOperand(2);
 
-  legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
-  legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
-
-  unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
   if (ST.hasDLInsts()) {
+    unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
+    legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
+
     BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
       .add(Src0)
       .add(Src1);
+
+    MRI.replaceRegWith(Dest.getReg(), NewDest);
+    addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
   } else {
-    unsigned Xor = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-    BuildMI(MBB, MII, DL, get(AMDGPU::V_XOR_B32_e64), Xor)
-      .add(Src0)
+    // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
+    // invert either source and then perform the XOR. If either source is a
+    // scalar register, then we can leave the inversion on the scalar unit to
+    // acheive a better distrubution of scalar and vector instructions.
+    bool Src0IsSGPR = Src0.isReg() &&
+                      RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
+    bool Src1IsSGPR = Src1.isReg() &&
+                      RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
+    MachineInstr *Not = nullptr;
+    MachineInstr *Xor = nullptr;
+    unsigned Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+    unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+
+    // Build a pair of scalar instructions and add them to the work list.
+    // The next iteration over the work list will lower these to the vector
+    // unit as necessary.
+    if (Src0IsSGPR) {
+      Not = BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp)
+        .add(Src0);
+      Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
+        .addReg(Temp)
       .add(Src1);
+    } else if (Src1IsSGPR) {
+      Not = BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp)
+        .add(Src1);
+      Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
+        .add(Src0)
+        .addReg(Temp);
+    } else {
+      Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
+        .add(Src0)
+        .add(Src1);
+      Not = BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
+        .addReg(Temp);
+      Worklist.insert(Not);
+    }
+
+    MRI.replaceRegWith(Dest.getReg(), NewDest);
+
+    Worklist.insert(Xor);
 
-    BuildMI(MBB, MII, DL, get(AMDGPU::V_NOT_B32_e64), NewDest)
-      .addReg(Xor);
+    addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
   }
+}
+
+void SIInstrInfo::splitScalarNotBinop(SetVectorType &Worklist,
+                                      MachineInstr &Inst,
+                                      unsigned Opcode) const {
+  MachineBasicBlock &MBB = *Inst.getParent();
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  MachineBasicBlock::iterator MII = Inst;
+  const DebugLoc &DL = Inst.getDebugLoc();
+
+  MachineOperand &Dest = Inst.getOperand(0);
+  MachineOperand &Src0 = Inst.getOperand(1);
+  MachineOperand &Src1 = Inst.getOperand(2);
+
+  unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+  unsigned Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+
+  MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
+    .add(Src0)
+    .add(Src1);
+
+  MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
+    .addReg(Interm);
+
+  Worklist.insert(&Op);
+  Worklist.insert(&Not);
+
+  MRI.replaceRegWith(Dest.getReg(), NewDest);
+  addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
+}
+
+void SIInstrInfo::splitScalarBinOpN2(SetVectorType& Worklist,
+                                     MachineInstr &Inst,
+                                     unsigned Opcode) const {
+  MachineBasicBlock &MBB = *Inst.getParent();
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  MachineBasicBlock::iterator MII = Inst;
+  const DebugLoc &DL = Inst.getDebugLoc();
+
+  MachineOperand &Dest = Inst.getOperand(0);
+  MachineOperand &Src0 = Inst.getOperand(1);
+  MachineOperand &Src1 = Inst.getOperand(2);
+
+  unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+  unsigned Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+
+  MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
+    .add(Src1);
+
+  MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
+    .add(Src0)
+    .addReg(Interm);
+
+  Worklist.insert(&Not);
+  Worklist.insert(&Op);
 
   MRI.replaceRegWith(Dest.getReg(), NewDest);
   addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
@@ -4598,13 +4733,13 @@ void SIInstrInfo::splitScalar64BitUnaryOp(
   const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
 
   unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
-  BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
+  MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
 
   MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                        AMDGPU::sub1, Src0SubRC);
 
   unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
-  BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
+  MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
 
   unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
   BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
@@ -4615,6 +4750,9 @@ void SIInstrInfo::splitScalar64BitUnaryOp(
 
   MRI.replaceRegWith(Dest.getReg(), FullDestReg);
 
+  Worklist.insert(&LoHalf);
+  Worklist.insert(&HiHalf);
+
   // We don't need to legalizeOperands here because for a single operand, src0
   // will support any kind of input.
 
@@ -4720,6 +4858,10 @@ void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist,
                                                        AMDGPU::sub0, Src0SubRC);
   MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
                                                        AMDGPU::sub0, Src1SubRC);
+  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
+                                                       AMDGPU::sub1, Src0SubRC);
+  MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
+                                                       AMDGPU::sub1, Src1SubRC);
 
   const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
   const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
@@ -4730,11 +4872,6 @@ void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist,
     .add(SrcReg0Sub0)
     .add(SrcReg1Sub0);
 
-  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
-                                                       AMDGPU::sub1, Src0SubRC);
-  MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
-                                                       AMDGPU::sub1, Src1SubRC);
-
   unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
   MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
     .add(SrcReg0Sub1)
@@ -4749,10 +4886,8 @@ void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist,
 
   MRI.replaceRegWith(Dest.getReg(), FullDestReg);
 
-  // Try to legalize the operands in case we need to swap the order to keep it
-  // valid.
-  legalizeOperands(LoHalf, MDT);
-  legalizeOperands(HiHalf, MDT);
+  Worklist.insert(&LoHalf);
+  Worklist.insert(&HiHalf);
 
   // Move all users of this moved vlaue.
   addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 2f222e66f6d..2c18455ac55 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -89,6 +89,14 @@ private:
   void lowerScalarXnor(SetVectorType &Worklist,
                        MachineInstr &Inst) const;
 
+  void splitScalarNotBinop(SetVectorType &Worklist,
+                           MachineInstr &Inst,
+                           unsigned Opcode) const;
+
+  void splitScalarBinOpN2(SetVectorType &Worklist,
+                          MachineInstr &Inst,
+                          unsigned Opcode) const;
+
   void splitScalar64BitUnaryOp(SetVectorType &Worklist,
                                MachineInstr &Inst,
                                unsigned Opcode) const;
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 5841dcb2b9c..ca5e981ac5c 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -336,6 +336,12 @@ class SOP2_64_32_32 <string opName, list<dag> pattern=[]> : SOP2_Pseudo <
   "$sdst, $src0, $src1", pattern
 >;
 
+class UniformUnaryFrag<SDPatternOperator Op> : PatFrag <
+  (ops node:$src0),
+  (Op $src0),
+  [{ return !N->isDivergent(); }]
+>;
+
 class UniformBinFrag<SDPatternOperator Op> : PatFrag <
   (ops node:$src0, node:$src1),
   (Op $src0, $src1),
@@ -421,16 +427,39 @@ def S_XNOR_B32 : SOP2_32 <"s_xnor_b32",
 def S_XNOR_B64 : SOP2_64 <"s_xnor_b64",
   [(set i64:$sdst, (not (xor_oneuse i64:$src0, i64:$src1)))]
 >;
+
+def S_NAND_B32 : SOP2_32 <"s_nand_b32",
+  [(set i32:$sdst, (not (and_oneuse i32:$src0, i32:$src1)))]
+>;
+
+def S_NAND_B64 : SOP2_64 <"s_nand_b64",
+  [(set i64:$sdst, (not (and_oneuse i64:$src0, i64:$src1)))]
+>;
+
+def S_NOR_B32 : SOP2_32 <"s_nor_b32",
+  [(set i32:$sdst, (not (or_oneuse i32:$src0, i32:$src1)))]
+>;
+
+def S_NOR_B64 : SOP2_64 <"s_nor_b64",
+  [(set i64:$sdst, (not (or_oneuse i64:$src0, i64:$src1)))]
+>;
 } // End isCommutable = 1
 
-def S_ANDN2_B32 : SOP2_32 <"s_andn2_b32">;
-def S_ANDN2_B64 : SOP2_64 <"s_andn2_b64">;
-def S_ORN2_B32 : SOP2_32 <"s_orn2_b32">;
-def S_ORN2_B64 : SOP2_64 <"s_orn2_b64">;
-def S_NAND_B32 : SOP2_32 <"s_nand_b32">;
-def S_NAND_B64 : SOP2_64 <"s_nand_b64">;
-def S_NOR_B32 : SOP2_32 <"s_nor_b32">;
-def S_NOR_B64 : SOP2_64 <"s_nor_b64">;
+def S_ANDN2_B32 : SOP2_32 <"s_andn2_b32",
+  [(set i32:$sdst, (UniformBinFrag<and> i32:$src0, (UniformUnaryFrag<not> i32:$src1)))]
+>;
+
+def S_ANDN2_B64 : SOP2_64 <"s_andn2_b64",
+  [(set i64:$sdst, (UniformBinFrag<and> i64:$src0, (UniformUnaryFrag<not> i64:$src1)))]
+>;
+
+def S_ORN2_B32 : SOP2_32 <"s_orn2_b32",
+  [(set i32:$sdst, (UniformBinFrag<or> i32:$src0, (UniformUnaryFrag<not> i32:$src1)))]
+>;
+
+def S_ORN2_B64 : SOP2_64 <"s_orn2_b64",
+  [(set i64:$sdst, (UniformBinFrag<or> i64:$src0, (UniformUnaryFrag<not> i64:$src1)))]
+>;
 } // End Defs = [SCC]
 
 // Use added complexity so these patterns are preferred to the VALU patterns.
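
The else-branch of lowerScalarXnor above rests on the Boolean identity !(x ^ y) == (!x ^ y) == (x ^ !y), which is what lets the inversion be placed on whichever operand already lives in an SGPR. A minimal standalone check of that identity (illustrative only, not part of the patch):

    // Sketch: verify ~(x ^ y) == (~x ^ y) == (x ^ ~y) over a few 32-bit values.
    #include <cassert>
    #include <cstdint>

    int main() {
      const uint32_t samples[] = {0u, 1u, 0x80000000u, 0xDEADBEEFu, 0xFFFFFFFFu};
      for (uint32_t x : samples) {
        for (uint32_t y : samples) {
          uint32_t xnor = ~(x ^ y); // s_xnor_b32 / v_xnor_b32 result
          assert(xnor == (~x ^ y)); // invert src0 first, then xor
          assert(xnor == (x ^ ~y)); // invert src1 first, then xor
        }
      }
      return 0;
    }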
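The moveToVALU changes route the new 64-bit scalar opcodes through splitScalar64BitBinaryOp / splitScalar64BitUnaryOp with 32-bit scalar opcodes. That is sound because every result bit of these bitwise operations depends only on the same bit position of the sources, so the low and high words can be processed independently and recombined with REG_SEQUENCE. A small standalone sketch of that reasoning, using plain C++ stand-ins for the SALU semantics (illustrative only):

    // Sketch: a 64-bit bitwise scalar op equals the per-half 32-bit op.
    #include <cassert>
    #include <cstdint>

    static uint64_t combine(uint32_t lo, uint32_t hi) {
      return (uint64_t)hi << 32 | lo; // REG_SEQUENCE of sub0/sub1
    }

    int main() {
      uint64_t x = 0x0123456789ABCDEFull, y = 0xFEDCBA9876543210ull;
      uint32_t xlo = (uint32_t)x, xhi = (uint32_t)(x >> 32);
      uint32_t ylo = (uint32_t)y, yhi = (uint32_t)(y >> 32);

      // s_nand_b64 == per-half s_nand_b32
      assert(~(x & y) == combine(~(xlo & ylo), ~(xhi & yhi)));
      // s_nor_b64 == per-half s_nor_b32
      assert(~(x | y) == combine(~(xlo | ylo), ~(xhi | yhi)));
      // s_andn2_b64 (src0 & ~src1) == per-half s_andn2_b32
      assert((x & ~y) == combine(xlo & ~ylo, xhi & ~yhi));
      // s_not_b64 == per-half s_not_b32
      assert(~x == combine(~xlo, ~xhi));
      return 0;
    }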
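In SOPInstructions.td, the new s_nand/s_nor patterns sit inside the isCommutable = 1 block while s_andn2/s_orn2 are defined after it closes. That split matches the algebra: NAND, NOR, and XNOR are symmetric in their operands, while ANDN2 (src0 & ~src1) and ORN2 (src0 | ~src1) are not. A quick standalone check (illustrative only):

    // Sketch: commutativity of nand/nor/xnor vs. non-commutativity of andn2/orn2.
    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t a = 0x12345678u, b = 0x9ABCDEF0u;
      assert(~(a & b) == ~(b & a)); // nand is commutative
      assert(~(a | b) == ~(b | a)); // nor is commutative
      assert(~(a ^ b) == ~(b ^ a)); // xnor is commutative
      assert((a & ~b) != (b & ~a)); // andn2 is not (for these operands)
      assert((a | ~b) != (b | ~a)); // orn2 is not (for these operands)
      return 0;
    }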

