Diffstat (limited to 'llvm/lib/Target')
-rw-r--r--   llvm/lib/Target/AMDGPU/AMDGPUInstructions.td |   8
-rw-r--r--   llvm/lib/Target/AMDGPU/SIInstrInfo.cpp       | 187
-rw-r--r--   llvm/lib/Target/AMDGPU/SIInstrInfo.h         |   8
-rw-r--r--   llvm/lib/Target/AMDGPU/SOPInstructions.td    |  45
4 files changed, 214 insertions(+), 34 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index 36e81ac78a1..282d1c11833 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -135,6 +135,12 @@ def brtarget : Operand<OtherVT>;
// Misc. PatFrags
//===----------------------------------------------------------------------===//
+class HasOneUseUnaryOp<SDPatternOperator op> : PatFrag<
+ (ops node:$src0),
+ (op $src0),
+ [{ return N->hasOneUse(); }]
+>;
+
class HasOneUseBinOp<SDPatternOperator op> : PatFrag<
(ops node:$src0, node:$src1),
(op $src0, $src1),
@@ -165,6 +171,8 @@ def or_oneuse : HasOneUseBinOp<or>;
def xor_oneuse : HasOneUseBinOp<xor>;
} // Properties = [SDNPCommutative, SDNPAssociative]
+def not_oneuse : HasOneUseUnaryOp<not>;
+
def add_oneuse : HasOneUseBinOp<add>;
def sub_oneuse : HasOneUseBinOp<sub>;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 978677ba7b0..5d2ca05ec5b 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3199,6 +3199,8 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
+ case AMDGPU::S_XNOR_B32:
+ return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
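
The change above gives S_XNOR_B32 a one-to-one VALU replacement (V_XNOR_B32_e64) only on subtargets that have the DL instructions; on other subtargets the INSTRUCTION_LIST_END sentinel signals that no single-instruction replacement exists, so the caller falls back to a multi-instruction expansion (see lowerScalarXnor below). A minimal standalone sketch of that lookup shape, using hypothetical enumerators rather than the generated AMDGPU opcode definitions:

#include <cassert>
#include <cstdint>

// Hypothetical, simplified stand-ins for the generated AMDGPU opcode
// enumerators (illustrative only, not the LLVM definitions).
enum Opcode : uint16_t {
  S_XOR_B32, S_XNOR_B32,
  V_XOR_B32_e64, V_XNOR_B32_e64,
  INSTRUCTION_LIST_END  // sentinel: no single-instruction VALU replacement
};

static Opcode getVALUOpFor(Opcode ScalarOp, bool HasDLInsts) {
  switch (ScalarOp) {
  case S_XOR_B32:
    return V_XOR_B32_e64;
  case S_XNOR_B32:
    // V_XNOR only exists on subtargets with the DL instructions; everyone
    // else has to expand the XNOR into multiple instructions instead.
    return HasDLInsts ? V_XNOR_B32_e64 : INSTRUCTION_LIST_END;
  default:
    return INSTRUCTION_LIST_END;
  }
}

int main() {
  assert(getVALUOpFor(S_XNOR_B32, /*HasDLInsts=*/true)  == V_XNOR_B32_e64);
  assert(getVALUOpFor(S_XNOR_B32, /*HasDLInsts=*/false) == INSTRUCTION_LIST_END);
  return 0;
}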
@@ -4166,22 +4168,47 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
// Default handling
break;
case AMDGPU::S_AND_B64:
- splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64, MDT);
+ splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
Inst.eraseFromParent();
continue;
case AMDGPU::S_OR_B64:
- splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_OR_B32_e64, MDT);
+ splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
Inst.eraseFromParent();
continue;
case AMDGPU::S_XOR_B64:
- splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64, MDT);
+ splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
+ Inst.eraseFromParent();
+ continue;
+
+ case AMDGPU::S_NAND_B64:
+ splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
+ Inst.eraseFromParent();
+ continue;
+
+ case AMDGPU::S_NOR_B64:
+ splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
+ Inst.eraseFromParent();
+ continue;
+
+ case AMDGPU::S_XNOR_B64:
+ splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
+ Inst.eraseFromParent();
+ continue;
+
+ case AMDGPU::S_ANDN2_B64:
+ splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
+ Inst.eraseFromParent();
+ continue;
+
+ case AMDGPU::S_ORN2_B64:
+ splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
Inst.eraseFromParent();
continue;
case AMDGPU::S_NOT_B64:
- splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::V_NOT_B32_e32);
+ splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
Inst.eraseFromParent();
continue;
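
Note that all of the new 64-bit cases now defer to splitScalar64BitBinaryOp with the corresponding 32-bit scalar opcode rather than a VALU opcode; the resulting 32-bit scalar instructions are added back to the worklist and are only moved to the VALU on a later iteration if that turns out to be necessary. A toy, self-contained model of that "expand and re-queue" pattern (not LLVM code; the strings and expansion rules are simplified stand-ins):

#include <cstdio>
#include <string>
#include <vector>

int main() {
  std::vector<std::string> Worklist = {"S_NAND_B64"};
  while (!Worklist.empty()) {
    const std::string Op = Worklist.back();
    Worklist.pop_back();
    if (Op == "S_NAND_B64") {
      // Like splitScalar64BitBinaryOp: one 32-bit scalar op per half,
      // both re-queued so the next iteration can expand them further.
      Worklist.push_back("S_NAND_B32:lo");
      Worklist.push_back("S_NAND_B32:hi");
    } else if (Op.compare(0, 10, "S_NAND_B32") == 0) {
      // Like splitScalarNotBinop: the base op followed by a NOT, re-queued.
      Worklist.push_back("S_AND_B32" + Op.substr(10));
      Worklist.push_back("S_NOT_B32" + Op.substr(10));
    } else {
      // A plain S_AND_B32 / S_NOT_B32 would only be rewritten to a VALU
      // instruction at this point if its operands or users require it.
      std::printf("reached leaf op: %s\n", Op.c_str());
    }
  }
  return 0;
}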
@@ -4262,8 +4289,23 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
Inst.eraseFromParent();
continue;
- case AMDGPU::S_XNOR_B64:
- splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
+ case AMDGPU::S_NAND_B32:
+ splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
+ Inst.eraseFromParent();
+ continue;
+
+ case AMDGPU::S_NOR_B32:
+ splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
+ Inst.eraseFromParent();
+ continue;
+
+ case AMDGPU::S_ANDN2_B32:
+ splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
+ Inst.eraseFromParent();
+ continue;
+
+ case AMDGPU::S_ORN2_B32:
+ splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
Inst.eraseFromParent();
continue;
@@ -4549,23 +4591,116 @@ void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist,
MachineOperand &Src0 = Inst.getOperand(1);
MachineOperand &Src1 = Inst.getOperand(2);
- legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
- legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
-
- unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
if (ST.hasDLInsts()) {
+ unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
+ legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
+
BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
.add(Src0)
.add(Src1);
+
+ MRI.replaceRegWith(Dest.getReg(), NewDest);
+ addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
} else {
- unsigned Xor = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- BuildMI(MBB, MII, DL, get(AMDGPU::V_XOR_B32_e64), Xor)
- .add(Src0)
+ // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
+ // invert either source and then perform the XOR. If either source is a
+ // scalar register, then we can leave the inversion on the scalar unit to
+ // achieve a better distribution of scalar and vector instructions.
+ bool Src0IsSGPR = Src0.isReg() &&
+ RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
+ bool Src1IsSGPR = Src1.isReg() &&
+ RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
+ MachineInstr *Not = nullptr;
+ MachineInstr *Xor = nullptr;
+ unsigned Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+
+ // Build a pair of scalar instructions and add them to the work list.
+ // The next iteration over the work list will lower these to the vector
+ // unit as necessary.
+ if (Src0IsSGPR) {
+ Not = BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp)
+ .add(Src0);
+ Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
+ .addReg(Temp)
.add(Src1);
+ } else if (Src1IsSGPR) {
+ Not = BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp)
+ .add(Src1);
+ Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
+ .add(Src0)
+ .addReg(Temp);
+ } else {
+ Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
+ .add(Src0)
+ .add(Src1);
+ Not = BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
+ .addReg(Temp);
+ Worklist.insert(Not);
+ }
+
+ MRI.replaceRegWith(Dest.getReg(), NewDest);
+
+ Worklist.insert(Xor);
- BuildMI(MBB, MII, DL, get(AMDGPU::V_NOT_B32_e64), NewDest)
- .addReg(Xor);
+ addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
}
+}
+
+void SIInstrInfo::splitScalarNotBinop(SetVectorType &Worklist,
+ MachineInstr &Inst,
+ unsigned Opcode) const {
+ MachineBasicBlock &MBB = *Inst.getParent();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ MachineBasicBlock::iterator MII = Inst;
+ const DebugLoc &DL = Inst.getDebugLoc();
+
+ MachineOperand &Dest = Inst.getOperand(0);
+ MachineOperand &Src0 = Inst.getOperand(1);
+ MachineOperand &Src1 = Inst.getOperand(2);
+
+ unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ unsigned Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+
+ MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
+ .add(Src0)
+ .add(Src1);
+
+ MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
+ .addReg(Interm);
+
+ Worklist.insert(&Op);
+ Worklist.insert(&Not);
+
+ MRI.replaceRegWith(Dest.getReg(), NewDest);
+ addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
+}
+
+void SIInstrInfo::splitScalarBinOpN2(SetVectorType& Worklist,
+ MachineInstr &Inst,
+ unsigned Opcode) const {
+ MachineBasicBlock &MBB = *Inst.getParent();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ MachineBasicBlock::iterator MII = Inst;
+ const DebugLoc &DL = Inst.getDebugLoc();
+
+ MachineOperand &Dest = Inst.getOperand(0);
+ MachineOperand &Src0 = Inst.getOperand(1);
+ MachineOperand &Src1 = Inst.getOperand(2);
+
+ unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ unsigned Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+
+ MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
+ .add(Src1);
+
+ MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
+ .add(Src0)
+ .addReg(Interm);
+
+ Worklist.insert(&Not);
+ Worklist.insert(&Op);
MRI.replaceRegWith(Dest.getReg(), NewDest);
addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
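
The lowerings above rest on simple bitwise identities: XNOR lets either operand be inverted before the XOR, NAND/NOR are the base op followed by a NOT (splitScalarNotBinop), and ANDN2/ORN2 apply the base op after inverting the second source (splitScalarBinOpN2). A quick standalone check of those identities on plain 32-bit integers; the s_* helpers are just stand-ins for the scalar ALU semantics, not LLVM APIs:

#include <cassert>
#include <cstdint>

// Stand-ins for the scalar ALU semantics (hypothetical helpers).
static uint32_t s_not(uint32_t A)             { return ~A; }
static uint32_t s_xor(uint32_t A, uint32_t B) { return A ^ B; }
static uint32_t s_and(uint32_t A, uint32_t B) { return A & B; }
static uint32_t s_or (uint32_t A, uint32_t B) { return A | B; }

int main() {
  const uint32_t X = 0xDEADBEEFu, Y = 0x12345678u;

  // lowerScalarXnor: ~(X ^ Y) == (~X) ^ Y == X ^ (~Y), so whichever source
  // already lives in an SGPR can be inverted on the scalar unit first.
  const uint32_t Xnor = s_not(s_xor(X, Y));
  assert(Xnor == s_xor(s_not(X), Y));
  assert(Xnor == s_xor(X, s_not(Y)));

  // splitScalarNotBinop: NAND/NOR expand to the base op followed by a NOT.
  assert(s_not(s_and(X, Y)) == uint32_t(~(X & Y)));  // S_NAND_B32
  assert(s_not(s_or (X, Y)) == uint32_t(~(X | Y)));  // S_NOR_B32

  // splitScalarBinOpN2: ANDN2/ORN2 invert the second source, then apply
  // the base op.
  assert(s_and(X, s_not(Y)) == (X & ~Y));  // S_ANDN2_B32
  assert(s_or (X, s_not(Y)) == (X | ~Y));  // S_ORN2_B32
  return 0;
}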
@@ -4598,13 +4733,13 @@ void SIInstrInfo::splitScalar64BitUnaryOp(
const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
- BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
+ MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
AMDGPU::sub1, Src0SubRC);
unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
- BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
+ MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
@@ -4615,6 +4750,9 @@ void SIInstrInfo::splitScalar64BitUnaryOp(
MRI.replaceRegWith(Dest.getReg(), FullDestReg);
+ Worklist.insert(&LoHalf);
+ Worklist.insert(&HiHalf);
+
// We don't need to legalizeOperands here because for a single operand, src0
// will support any kind of input.
@@ -4720,6 +4858,10 @@ void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist,
AMDGPU::sub0, Src0SubRC);
MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
AMDGPU::sub0, Src1SubRC);
+ MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
+ AMDGPU::sub1, Src0SubRC);
+ MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
+ AMDGPU::sub1, Src1SubRC);
const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
@@ -4730,11 +4872,6 @@ void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist,
.add(SrcReg0Sub0)
.add(SrcReg1Sub0);
- MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
- AMDGPU::sub1, Src0SubRC);
- MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
- AMDGPU::sub1, Src1SubRC);
-
unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
.add(SrcReg0Sub1)
@@ -4749,10 +4886,8 @@ void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist,
MRI.replaceRegWith(Dest.getReg(), FullDestReg);
- // Try to legalize the operands in case we need to swap the order to keep it
- // valid.
- legalizeOperands(LoHalf, MDT);
- legalizeOperands(HiHalf, MDT);
+ Worklist.insert(&LoHalf);
+ Worklist.insert(&HiHalf);
// Move all users of this moved value.
addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
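
splitScalar64BitBinaryOp (and its unary counterpart) work because bitwise operations act on each bit independently: a 64-bit op can be computed as two 32-bit ops on the sub0/sub1 halves and the results recombined with a REG_SEQUENCE. A small standalone sketch of that decomposition in plain C++ (illustrative only, not LLVM code):

#include <cassert>
#include <cstdint>

static uint32_t xor32(uint32_t A, uint32_t B) { return A ^ B; }

// A 64-bit bitwise op computed as two independent 32-bit ops on the
// low (sub0) and high (sub1) halves.
static uint64_t split64(uint64_t A, uint64_t B,
                        uint32_t (*Op32)(uint32_t, uint32_t)) {
  const uint32_t Lo = Op32(uint32_t(A), uint32_t(B));              // sub0 half
  const uint32_t Hi = Op32(uint32_t(A >> 32), uint32_t(B >> 32));  // sub1 half
  return (uint64_t(Hi) << 32) | Lo;  // analogous to the REG_SEQUENCE
}

int main() {
  const uint64_t A = 0x0123456789ABCDEFull;
  const uint64_t B = 0xFEDCBA9876543210ull;
  // e.g. S_XOR_B64 lowered as two S_XOR_B32 halves.
  assert(split64(A, B, xor32) == (A ^ B));
  return 0;
}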
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 2f222e66f6d..2c18455ac55 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -89,6 +89,14 @@ private:
void lowerScalarXnor(SetVectorType &Worklist,
MachineInstr &Inst) const;
+ void splitScalarNotBinop(SetVectorType &Worklist,
+ MachineInstr &Inst,
+ unsigned Opcode) const;
+
+ void splitScalarBinOpN2(SetVectorType &Worklist,
+ MachineInstr &Inst,
+ unsigned Opcode) const;
+
void splitScalar64BitUnaryOp(SetVectorType &Worklist,
MachineInstr &Inst, unsigned Opcode) const;
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 5841dcb2b9c..ca5e981ac5c 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -336,6 +336,12 @@ class SOP2_64_32_32 <string opName, list<dag> pattern=[]> : SOP2_Pseudo <
"$sdst, $src0, $src1", pattern
>;
+class UniformUnaryFrag<SDPatternOperator Op> : PatFrag <
+ (ops node:$src0),
+ (Op $src0),
+ [{ return !N->isDivergent(); }]
+>;
+
class UniformBinFrag<SDPatternOperator Op> : PatFrag <
(ops node:$src0, node:$src1),
(Op $src0, $src1),
@@ -421,16 +427,39 @@ def S_XNOR_B32 : SOP2_32 <"s_xnor_b32",
def S_XNOR_B64 : SOP2_64 <"s_xnor_b64",
[(set i64:$sdst, (not (xor_oneuse i64:$src0, i64:$src1)))]
>;
+
+def S_NAND_B32 : SOP2_32 <"s_nand_b32",
+ [(set i32:$sdst, (not (and_oneuse i32:$src0, i32:$src1)))]
+>;
+
+def S_NAND_B64 : SOP2_64 <"s_nand_b64",
+ [(set i64:$sdst, (not (and_oneuse i64:$src0, i64:$src1)))]
+>;
+
+def S_NOR_B32 : SOP2_32 <"s_nor_b32",
+ [(set i32:$sdst, (not (or_oneuse i32:$src0, i32:$src1)))]
+>;
+
+def S_NOR_B64 : SOP2_64 <"s_nor_b64",
+ [(set i64:$sdst, (not (or_oneuse i64:$src0, i64:$src1)))]
+>;
} // End isCommutable = 1
-def S_ANDN2_B32 : SOP2_32 <"s_andn2_b32">;
-def S_ANDN2_B64 : SOP2_64 <"s_andn2_b64">;
-def S_ORN2_B32 : SOP2_32 <"s_orn2_b32">;
-def S_ORN2_B64 : SOP2_64 <"s_orn2_b64">;
-def S_NAND_B32 : SOP2_32 <"s_nand_b32">;
-def S_NAND_B64 : SOP2_64 <"s_nand_b64">;
-def S_NOR_B32 : SOP2_32 <"s_nor_b32">;
-def S_NOR_B64 : SOP2_64 <"s_nor_b64">;
+def S_ANDN2_B32 : SOP2_32 <"s_andn2_b32",
+ [(set i32:$sdst, (UniformBinFrag<and> i32:$src0, (UniformUnaryFrag<not> i32:$src1)))]
+>;
+
+def S_ANDN2_B64 : SOP2_64 <"s_andn2_b64",
+ [(set i64:$sdst, (UniformBinFrag<and> i64:$src0, (UniformUnaryFrag<not> i64:$src1)))]
+>;
+
+def S_ORN2_B32 : SOP2_32 <"s_orn2_b32",
+ [(set i32:$sdst, (UniformBinFrag<or> i32:$src0, (UniformUnaryFrag<not> i32:$src1)))]
+>;
+
+def S_ORN2_B64 : SOP2_64 <"s_orn2_b64",
+ [(set i64:$sdst, (UniformBinFrag<or> i64:$src0, (UniformUnaryFrag<not> i64:$src1)))]
+>;
} // End Defs = [SCC]
// Use added complexity so these patterns are preferred to the VALU patterns.