Diffstat (limited to 'llvm')
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUInstructions.td             8
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.cpp                 187
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.h                     8
-rw-r--r--  llvm/lib/Target/AMDGPU/SOPInstructions.td               45
-rw-r--r--  llvm/test/CodeGen/AMDGPU/andorn2.ll                    103
-rw-r--r--  llvm/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll    2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fceil64.ll                      3
-rw-r--r--  llvm/test/CodeGen/AMDGPU/ftrunc.f64.ll                   3
-rw-r--r--  llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll            6
-rw-r--r--  llvm/test/CodeGen/AMDGPU/nand.ll                        83
-rw-r--r--  llvm/test/CodeGen/AMDGPU/nor.ll                         83
-rw-r--r--  llvm/test/CodeGen/AMDGPU/xnor.ll                        33
12 files changed, 518 insertions(+), 46 deletions(-)
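
Taken as a whole, the patch lets instruction selection form the scalar S_NAND/S_NOR/S_XNOR/S_ANDN2/S_ORN2 instructions for uniform values and splits or re-lowers them when the result turns out to be divergent. As a minimal illustration (it mirrors the new nor.ll test below; the kernel name is just for this example), a uniform or-then-invert with a single use now selects to one s_nor_b32:

define amdgpu_kernel void @nor_uniform(i32 addrspace(1)* %out, i32 %a, i32 %b) {
entry:
  %or  = or i32 %a, %b        ; both operands are uniform kernel arguments
  %not = xor i32 %or, -1      ; the or and the invert fold into a single s_nor_b32
  store i32 %not, i32 addrspace(1)* %out
  ret void
}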
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index 36e81ac78a1..282d1c11833 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -135,6 +135,12 @@ def brtarget : Operand<OtherVT>;
// Misc. PatFrags
//===----------------------------------------------------------------------===//
+class HasOneUseUnaryOp<SDPatternOperator op> : PatFrag<
+ (ops node:$src0),
+ (op $src0),
+ [{ return N->hasOneUse(); }]
+>;
+
class HasOneUseBinOp<SDPatternOperator op> : PatFrag<
(ops node:$src0, node:$src1),
(op $src0, $src1),
@@ -165,6 +171,8 @@ def or_oneuse : HasOneUseBinOp<or>;
def xor_oneuse : HasOneUseBinOp<xor>;
} // Properties = [SDNPCommutative, SDNPAssociative]
+def not_oneuse : HasOneUseUnaryOp<not>;
+
def add_oneuse : HasOneUseBinOp<add>;
def sub_oneuse : HasOneUseBinOp<sub>;
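
The hasOneUse() predicate in these fragments is what keeps a fused pattern from firing when the intermediate result is still needed on its own. A minimal sketch (it mirrors the scalar_nand_i32_mul_use test added later in this patch; the kernel name is just for this example): %and has two uses, so selection keeps s_and_b32 plus s_not_b32 rather than folding to s_nand_b32.

define amdgpu_kernel void @nand_two_uses(i32 addrspace(1)* %r0, i32 addrspace(1)* %r1,
                                         i32 %a, i32 %b) {
entry:
  %and  = and i32 %a, %b      ; two uses: the invert below and the add
  %nand = xor i32 %and, -1    ; and_oneuse does not match, so no s_nand_b32 is formed
  %sum  = add i32 %and, %a
  store i32 %nand, i32 addrspace(1)* %r0
  store i32 %sum, i32 addrspace(1)* %r1
  ret void
}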
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 978677ba7b0..5d2ca05ec5b 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3199,6 +3199,8 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
+ case AMDGPU::S_XNOR_B32:
+ return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
@@ -4166,22 +4168,47 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
// Default handling
break;
case AMDGPU::S_AND_B64:
- splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64, MDT);
+ splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
Inst.eraseFromParent();
continue;
case AMDGPU::S_OR_B64:
- splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_OR_B32_e64, MDT);
+ splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
Inst.eraseFromParent();
continue;
case AMDGPU::S_XOR_B64:
- splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64, MDT);
+ splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
+ Inst.eraseFromParent();
+ continue;
+
+ case AMDGPU::S_NAND_B64:
+ splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
+ Inst.eraseFromParent();
+ continue;
+
+ case AMDGPU::S_NOR_B64:
+ splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
+ Inst.eraseFromParent();
+ continue;
+
+ case AMDGPU::S_XNOR_B64:
+ splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
+ Inst.eraseFromParent();
+ continue;
+
+ case AMDGPU::S_ANDN2_B64:
+ splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
+ Inst.eraseFromParent();
+ continue;
+
+ case AMDGPU::S_ORN2_B64:
+ splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
Inst.eraseFromParent();
continue;
case AMDGPU::S_NOT_B64:
- splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::V_NOT_B32_e32);
+ splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
Inst.eraseFromParent();
continue;
@@ -4262,8 +4289,23 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
Inst.eraseFromParent();
continue;
- case AMDGPU::S_XNOR_B64:
- splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
+ case AMDGPU::S_NAND_B32:
+ splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
+ Inst.eraseFromParent();
+ continue;
+
+ case AMDGPU::S_NOR_B32:
+ splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
+ Inst.eraseFromParent();
+ continue;
+
+ case AMDGPU::S_ANDN2_B32:
+ splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
+ Inst.eraseFromParent();
+ continue;
+
+ case AMDGPU::S_ORN2_B32:
+ splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
Inst.eraseFromParent();
continue;
@@ -4549,23 +4591,116 @@ void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist,
MachineOperand &Src0 = Inst.getOperand(1);
MachineOperand &Src1 = Inst.getOperand(2);
- legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
- legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
-
- unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
if (ST.hasDLInsts()) {
+ unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
+ legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
+
BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
.add(Src0)
.add(Src1);
+
+ MRI.replaceRegWith(Dest.getReg(), NewDest);
+ addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
} else {
- unsigned Xor = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- BuildMI(MBB, MII, DL, get(AMDGPU::V_XOR_B32_e64), Xor)
- .add(Src0)
+ // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
+ // invert either source and then perform the XOR. If either source is a
+ // scalar register, then we can leave the inversion on the scalar unit to
+ // achieve a better distribution of scalar and vector instructions.
+ bool Src0IsSGPR = Src0.isReg() &&
+ RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
+ bool Src1IsSGPR = Src1.isReg() &&
+ RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
+ MachineInstr *Not = nullptr;
+ MachineInstr *Xor = nullptr;
+ unsigned Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+
+ // Build a pair of scalar instructions and add them to the work list.
+ // The next iteration over the work list will lower these to the vector
+ // unit as necessary.
+ if (Src0IsSGPR) {
+ Not = BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp)
+ .add(Src0);
+ Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
+ .addReg(Temp)
.add(Src1);
+ } else if (Src1IsSGPR) {
+ Not = BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp)
+ .add(Src1);
+ Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
+ .add(Src0)
+ .addReg(Temp);
+ } else {
+ Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
+ .add(Src0)
+ .add(Src1);
+ Not = BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
+ .addReg(Temp);
+ Worklist.insert(Not);
+ }
+
+ MRI.replaceRegWith(Dest.getReg(), NewDest);
+
+ Worklist.insert(Xor);
- BuildMI(MBB, MII, DL, get(AMDGPU::V_NOT_B32_e64), NewDest)
- .addReg(Xor);
+ addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
}
+}
+
+void SIInstrInfo::splitScalarNotBinop(SetVectorType &Worklist,
+ MachineInstr &Inst,
+ unsigned Opcode) const {
+ MachineBasicBlock &MBB = *Inst.getParent();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ MachineBasicBlock::iterator MII = Inst;
+ const DebugLoc &DL = Inst.getDebugLoc();
+
+ MachineOperand &Dest = Inst.getOperand(0);
+ MachineOperand &Src0 = Inst.getOperand(1);
+ MachineOperand &Src1 = Inst.getOperand(2);
+
+ unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ unsigned Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+
+ MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
+ .add(Src0)
+ .add(Src1);
+
+ MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
+ .addReg(Interm);
+
+ Worklist.insert(&Op);
+ Worklist.insert(&Not);
+
+ MRI.replaceRegWith(Dest.getReg(), NewDest);
+ addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
+}
+
+void SIInstrInfo::splitScalarBinOpN2(SetVectorType& Worklist,
+ MachineInstr &Inst,
+ unsigned Opcode) const {
+ MachineBasicBlock &MBB = *Inst.getParent();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ MachineBasicBlock::iterator MII = Inst;
+ const DebugLoc &DL = Inst.getDebugLoc();
+
+ MachineOperand &Dest = Inst.getOperand(0);
+ MachineOperand &Src0 = Inst.getOperand(1);
+ MachineOperand &Src1 = Inst.getOperand(2);
+
+ unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ unsigned Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+
+ MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
+ .add(Src1);
+
+ MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
+ .add(Src0)
+ .addReg(Interm);
+
+ Worklist.insert(&Not);
+ Worklist.insert(&Op);
MRI.replaceRegWith(Dest.getReg(), NewDest);
addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
@@ -4598,13 +4733,13 @@ void SIInstrInfo::splitScalar64BitUnaryOp(
const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
- BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
+ MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
AMDGPU::sub1, Src0SubRC);
unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
- BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
+ MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
@@ -4615,6 +4750,9 @@ void SIInstrInfo::splitScalar64BitUnaryOp(
MRI.replaceRegWith(Dest.getReg(), FullDestReg);
+ Worklist.insert(&LoHalf);
+ Worklist.insert(&HiHalf);
+
// We don't need to legalizeOperands here because for a single operand, src0
// will support any kind of input.
@@ -4720,6 +4858,10 @@ void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist,
AMDGPU::sub0, Src0SubRC);
MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
AMDGPU::sub0, Src1SubRC);
+ MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
+ AMDGPU::sub1, Src0SubRC);
+ MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
+ AMDGPU::sub1, Src1SubRC);
const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
@@ -4730,11 +4872,6 @@ void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist,
.add(SrcReg0Sub0)
.add(SrcReg1Sub0);
- MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
- AMDGPU::sub1, Src0SubRC);
- MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
- AMDGPU::sub1, Src1SubRC);
-
unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
.add(SrcReg0Sub1)
@@ -4749,10 +4886,8 @@ void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist,
MRI.replaceRegWith(Dest.getReg(), FullDestReg);
- // Try to legalize the operands in case we need to swap the order to keep it
- // valid.
- legalizeOperands(LoHalf, MDT);
- legalizeOperands(HiHalf, MDT);
+ Worklist.insert(&LoHalf);
+ Worklist.insert(&HiHalf);
// Move all users of this moved value.
addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
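
The rewritten lowerScalarXnor above uses the identity !(x ^ y) == (!x ^ y) == (x ^ !y): when v_xnor_b32 is unavailable, the inversion is left on whichever source is an SGPR so the scalar and vector units share the work. A minimal sketch of an input that reaches this path (it mirrors the xnor_s_v_i32_one_use test added below; the kernel name is just for this example): the uniform %s is inverted with s_not_b32 and the divergent xor is done with v_xor_b32.

define amdgpu_kernel void @xnor_sgpr_vgpr(i32 addrspace(1)* %out, i32 %s) {
entry:
  %v   = call i32 @llvm.amdgcn.workitem.id.x()  ; divergent value
  %xor = xor i32 %s, %v
  %d   = xor i32 %xor, -1                       ; xnor of a uniform and a divergent value
  store i32 %d, i32 addrspace(1)* %out
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x()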
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 2f222e66f6d..2c18455ac55 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -89,6 +89,14 @@ private:
void lowerScalarXnor(SetVectorType &Worklist,
MachineInstr &Inst) const;
+ void splitScalarNotBinop(SetVectorType &Worklist,
+ MachineInstr &Inst,
+ unsigned Opcode) const;
+
+ void splitScalarBinOpN2(SetVectorType &Worklist,
+ MachineInstr &Inst,
+ unsigned Opcode) const;
+
void splitScalar64BitUnaryOp(SetVectorType &Worklist,
MachineInstr &Inst, unsigned Opcode) const;
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 5841dcb2b9c..ca5e981ac5c 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -336,6 +336,12 @@ class SOP2_64_32_32 <string opName, list<dag> pattern=[]> : SOP2_Pseudo <
"$sdst, $src0, $src1", pattern
>;
+class UniformUnaryFrag<SDPatternOperator Op> : PatFrag <
+ (ops node:$src0),
+ (Op $src0),
+ [{ return !N->isDivergent(); }]
+>;
+
class UniformBinFrag<SDPatternOperator Op> : PatFrag <
(ops node:$src0, node:$src1),
(Op $src0, $src1),
@@ -421,16 +427,39 @@ def S_XNOR_B32 : SOP2_32 <"s_xnor_b32",
def S_XNOR_B64 : SOP2_64 <"s_xnor_b64",
[(set i64:$sdst, (not (xor_oneuse i64:$src0, i64:$src1)))]
>;
+
+def S_NAND_B32 : SOP2_32 <"s_nand_b32",
+ [(set i32:$sdst, (not (and_oneuse i32:$src0, i32:$src1)))]
+>;
+
+def S_NAND_B64 : SOP2_64 <"s_nand_b64",
+ [(set i64:$sdst, (not (and_oneuse i64:$src0, i64:$src1)))]
+>;
+
+def S_NOR_B32 : SOP2_32 <"s_nor_b32",
+ [(set i32:$sdst, (not (or_oneuse i32:$src0, i32:$src1)))]
+>;
+
+def S_NOR_B64 : SOP2_64 <"s_nor_b64",
+ [(set i64:$sdst, (not (or_oneuse i64:$src0, i64:$src1)))]
+>;
} // End isCommutable = 1
-def S_ANDN2_B32 : SOP2_32 <"s_andn2_b32">;
-def S_ANDN2_B64 : SOP2_64 <"s_andn2_b64">;
-def S_ORN2_B32 : SOP2_32 <"s_orn2_b32">;
-def S_ORN2_B64 : SOP2_64 <"s_orn2_b64">;
-def S_NAND_B32 : SOP2_32 <"s_nand_b32">;
-def S_NAND_B64 : SOP2_64 <"s_nand_b64">;
-def S_NOR_B32 : SOP2_32 <"s_nor_b32">;
-def S_NOR_B64 : SOP2_64 <"s_nor_b64">;
+def S_ANDN2_B32 : SOP2_32 <"s_andn2_b32",
+ [(set i32:$sdst, (UniformBinFrag<and> i32:$src0, (UniformUnaryFrag<not> i32:$src1)))]
+>;
+
+def S_ANDN2_B64 : SOP2_64 <"s_andn2_b64",
+ [(set i64:$sdst, (UniformBinFrag<and> i64:$src0, (UniformUnaryFrag<not> i64:$src1)))]
+>;
+
+def S_ORN2_B32 : SOP2_32 <"s_orn2_b32",
+ [(set i32:$sdst, (UniformBinFrag<or> i32:$src0, (UniformUnaryFrag<not> i32:$src1)))]
+>;
+
+def S_ORN2_B64 : SOP2_64 <"s_orn2_b64",
+ [(set i64:$sdst, (UniformBinFrag<or> i64:$src0, (UniformUnaryFrag<not> i64:$src1)))]
+>;
} // End Defs = [SCC]
// Use added complexity so these patterns are preferred to the VALU patterns.
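
The UniformBinFrag and UniformUnaryFrag wrappers restrict the new ANDN2/ORN2 patterns to non-divergent nodes, so they only ever produce SALU code. A minimal sketch (it mirrors the scalar_andn2_i32_one_use test added below; the kernel name is just for this example) in which a uniform invert folds with the and into s_andn2_b32:

define amdgpu_kernel void @andn2_uniform(i32 addrspace(1)* %out, i32 %a, i32 %b) {
entry:
  %nb = xor i32 %b, -1        ; uniform invert ...
  %r  = and i32 %a, %nb       ; ... folds with the and into s_andn2_b32
  store i32 %r, i32 addrspace(1)* %out
  ret void
}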
diff --git a/llvm/test/CodeGen/AMDGPU/andorn2.ll b/llvm/test/CodeGen/AMDGPU/andorn2.ll
new file mode 100644
index 00000000000..390c103f367
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/andorn2.ll
@@ -0,0 +1,103 @@
+; RUN: llc -march=amdgcn -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX600 %s
+; RUN: llc -march=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX700 %s
+; RUN: llc -march=amdgcn -mcpu=gfx801 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX801 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX900 %s
+
+; GCN-LABEL: {{^}}scalar_andn2_i32_one_use
+; GCN: s_andn2_b32
+define amdgpu_kernel void @scalar_andn2_i32_one_use(
+ i32 addrspace(1)* %r0, i32 %a, i32 %b) {
+entry:
+ %nb = xor i32 %b, -1
+ %r0.val = and i32 %a, %nb
+ store i32 %r0.val, i32 addrspace(1)* %r0
+ ret void
+}
+
+; GCN-LABEL: {{^}}scalar_andn2_i64_one_use
+; GCN: s_andn2_b64
+define amdgpu_kernel void @scalar_andn2_i64_one_use(
+ i64 addrspace(1)* %r0, i64 %a, i64 %b) {
+entry:
+ %nb = xor i64 %b, -1
+ %r0.val = and i64 %a, %nb
+ store i64 %r0.val, i64 addrspace(1)* %r0
+ ret void
+}
+
+; GCN-LABEL: {{^}}scalar_orn2_i32_one_use
+; GCN: s_orn2_b32
+define amdgpu_kernel void @scalar_orn2_i32_one_use(
+ i32 addrspace(1)* %r0, i32 %a, i32 %b) {
+entry:
+ %nb = xor i32 %b, -1
+ %r0.val = or i32 %a, %nb
+ store i32 %r0.val, i32 addrspace(1)* %r0
+ ret void
+}
+
+; GCN-LABEL: {{^}}scalar_orn2_i64_one_use
+; GCN: s_orn2_b64
+define amdgpu_kernel void @scalar_orn2_i64_one_use(
+ i64 addrspace(1)* %r0, i64 %a, i64 %b) {
+entry:
+ %nb = xor i64 %b, -1
+ %r0.val = or i64 %a, %nb
+ store i64 %r0.val, i64 addrspace(1)* %r0
+ ret void
+}
+
+; GCN-LABEL: {{^}}vector_andn2_i32_s_v_one_use
+; GCN: v_not_b32
+; GCN: v_and_b32
+define amdgpu_kernel void @vector_andn2_i32_s_v_one_use(
+ i32 addrspace(1)* %r0, i32 %s) {
+entry:
+ %v = call i32 @llvm.amdgcn.workitem.id.x() #1
+ %not = xor i32 %v, -1
+ %r0.val = and i32 %s, %not
+ store i32 %r0.val, i32 addrspace(1)* %r0
+ ret void
+}
+
+; GCN-LABEL: {{^}}vector_andn2_i32_v_s_one_use
+; GCN: s_not_b32
+; GCN: v_and_b32
+define amdgpu_kernel void @vector_andn2_i32_v_s_one_use(
+ i32 addrspace(1)* %r0, i32 %s) {
+entry:
+ %v = call i32 @llvm.amdgcn.workitem.id.x() #1
+ %not = xor i32 %s, -1
+ %r0.val = and i32 %v, %not
+ store i32 %r0.val, i32 addrspace(1)* %r0
+ ret void
+}
+
+; GCN-LABEL: {{^}}vector_orn2_i32_s_v_one_use
+; GCN: v_not_b32
+; GCN: v_or_b32
+define amdgpu_kernel void @vector_orn2_i32_s_v_one_use(
+ i32 addrspace(1)* %r0, i32 %s) {
+entry:
+ %v = call i32 @llvm.amdgcn.workitem.id.x() #1
+ %not = xor i32 %v, -1
+ %r0.val = or i32 %s, %not
+ store i32 %r0.val, i32 addrspace(1)* %r0
+ ret void
+}
+
+; GCN-LABEL: {{^}}vector_orn2_i32_v_s_one_use
+; GCN: s_not_b32
+; GCN: v_or_b32
+define amdgpu_kernel void @vector_orn2_i32_v_s_one_use(
+ i32 addrspace(1)* %r0, i32 %s) {
+entry:
+ %v = call i32 @llvm.amdgcn.workitem.id.x() #1
+ %not = xor i32 %s, -1
+ %r0.val = or i32 %v, %not
+ store i32 %r0.val, i32 addrspace(1)* %r0
+ ret void
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.amdgcn.workitem.id.x() #0
diff --git a/llvm/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll b/llvm/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll
index 09d4b2c8bd7..8611cd080e1 100644
--- a/llvm/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll
+++ b/llvm/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll
@@ -107,7 +107,7 @@ define amdgpu_kernel void @fold_mi_v_not_0(i64 addrspace(1)* %out) {
; GCN: v_bcnt_u32_b32{{(_e64)*}} v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, 0{{$}}
; GCN: v_bcnt_u32_b32{{(_e32)*(_e64)*}} v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, v[[RESULT_LO]]{{$}}
; GCN-DAG: v_not_b32_e32 v[[RESULT_LO]], v[[RESULT_LO]]
-; GCN-DAG: v_or_b32_e32 v[[RESULT_LO]], v[[RESULT_LO]], v[[VREG1_LO]]
+; GCN-DAG: v_or_b32_e32 v[[RESULT_LO]], v[[VREG1_LO]], v[[RESULT_LO]]
; GCN-DAG: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], v[[VREG1_HI]]
; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
define amdgpu_kernel void @fold_mi_or_neg1(i64 addrspace(1)* %out) {
diff --git a/llvm/test/CodeGen/AMDGPU/fceil64.ll b/llvm/test/CodeGen/AMDGPU/fceil64.ll
index ba26ed23832..da852af3f23 100644
--- a/llvm/test/CodeGen/AMDGPU/fceil64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fceil64.ll
@@ -17,8 +17,7 @@ declare <16 x double> @llvm.ceil.v16f64(<16 x double>) nounwind readnone
; are not always followed.
; SI-DAG: s_add_i32 [[SEXP0:s[0-9]+]], [[SEXP]], 0xfffffc01
; SI-DAG: s_lshr_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], [[SEXP0]]
-; SI-DAG: s_not_b64
-; SI-DAG: s_and_b64
+; SI-DAG: s_andn2_b64
; SI-DAG: cmp_gt_i32
; SI-DAG: cndmask_b32
; SI-DAG: cndmask_b32
diff --git a/llvm/test/CodeGen/AMDGPU/ftrunc.f64.ll b/llvm/test/CodeGen/AMDGPU/ftrunc.f64.ll
index 6fc4c8b7d24..226125335c3 100644
--- a/llvm/test/CodeGen/AMDGPU/ftrunc.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/ftrunc.f64.ll
@@ -27,8 +27,7 @@ define amdgpu_kernel void @v_ftrunc_f64(double addrspace(1)* %out, double addrsp
; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
; SI-DAG: s_add_i32 [[SEXP1:s[0-9]+]], [[SEXP]], 0xfffffc01
; SI-DAG: s_lshr_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], [[SEXP1]]
-; SI-DAG: s_not_b64
-; SI-DAG: s_and_b64
+; SI-DAG: s_andn2_b64
; SI-DAG: cmp_gt_i32
; SI-DAG: cndmask_b32
; SI-DAG: cndmask_b32
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
index aca576cbde6..cf19486dfca 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -266,8 +266,7 @@ define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %ou
; VI: v_mov_b32_e32 [[V_LOAD:v[0-9]+]], [[LOAD]]
; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
; VI: s_lshl_b32 [[SHIFTED_MASK:s[0-9]+]], 0xffff, [[SCALED_IDX]]
-; VI: s_not_b32 [[NOT_MASK:s[0-9]+]], [[SHIFTED_MASK]]
-; VI: s_and_b32 [[AND_NOT_MASK:s[0-9]+]], [[NOT_MASK]], [[LOAD]]
+; VI: s_andn2_b32 [[AND_NOT_MASK:s[0-9]+]], [[LOAD]], [[SHIFTED_MASK]]
; VI: v_bfi_b32 [[BFI:v[0-9]+]], [[SHIFTED_MASK]], 5, [[V_LOAD]]
; VI: s_lshr_b32 [[HI2:s[0-9]+]], [[AND_NOT_MASK]], 16
@@ -306,8 +305,7 @@ define amdgpu_kernel void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %ou
; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
; VI-DAG: s_mov_b32 s[[MASK_LO:[0-9]+]], 0xffff
; VI: s_lshl_b64 s{{\[}}[[MASK_SHIFT_LO:[0-9]+]]:[[MASK_SHIFT_HI:[0-9]+]]{{\]}}, s{{\[}}[[MASK_LO]]:[[MASK_HI]]{{\]}}, [[SCALED_IDX]]
-; VI: s_not_b64 [[NOT_MASK:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[MASK_SHIFT_LO]]:[[MASK_SHIFT_HI]]{{\]}}
-; VI: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], [[NOT_MASK]], [[VEC]]
+; VI: s_andn2_b64 [[AND:s\[[0-9]+:[0-9]+\]]], [[VEC]], s{{\[}}[[MASK_SHIFT_LO]]:[[MASK_SHIFT_HI]]{{\]}}
; VI: s_and_b32 s[[INS:[0-9]+]], s[[MASK_SHIFT_LO]], 5
; VI: s_or_b64 s{{\[}}[[RESULT0:[0-9]+]]:[[RESULT1:[0-9]+]]{{\]}}, s{{\[}}[[INS]]:[[MASK_HI]]{{\]}}, [[AND]]
; VI: v_mov_b32_e32 v[[V_RESULT0:[0-9]+]], s[[RESULT0]]
diff --git a/llvm/test/CodeGen/AMDGPU/nand.ll b/llvm/test/CodeGen/AMDGPU/nand.ll
new file mode 100644
index 00000000000..be7d9f677ec
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/nand.ll
@@ -0,0 +1,83 @@
+; RUN: llc -march=amdgcn -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX600 %s
+; RUN: llc -march=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX700 %s
+; RUN: llc -march=amdgcn -mcpu=gfx801 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX801 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX900 %s
+
+; GCN-LABEL: {{^}}scalar_nand_i32_one_use
+; GCN: s_nand_b32
+define amdgpu_kernel void @scalar_nand_i32_one_use(
+ i32 addrspace(1)* %r0, i32 %a, i32 %b) {
+entry:
+ %and = and i32 %a, %b
+ %r0.val = xor i32 %and, -1
+ store i32 %r0.val, i32 addrspace(1)* %r0
+ ret void
+}
+
+; GCN-LABEL: {{^}}scalar_nand_i32_mul_use
+; GCN-NOT: s_nand_b32
+; GCN: s_and_b32
+; GCN: s_not_b32
+; GCN: s_add_i32
+define amdgpu_kernel void @scalar_nand_i32_mul_use(
+ i32 addrspace(1)* %r0, i32 addrspace(1)* %r1, i32 %a, i32 %b) {
+entry:
+ %and = and i32 %a, %b
+ %r0.val = xor i32 %and, -1
+ %r1.val = add i32 %and, %a
+ store i32 %r0.val, i32 addrspace(1)* %r0
+ store i32 %r1.val, i32 addrspace(1)* %r1
+ ret void
+}
+
+; GCN-LABEL: {{^}}scalar_nand_i64_one_use
+; GCN: s_nand_b64
+define amdgpu_kernel void @scalar_nand_i64_one_use(
+ i64 addrspace(1)* %r0, i64 %a, i64 %b) {
+entry:
+ %and = and i64 %a, %b
+ %r0.val = xor i64 %and, -1
+ store i64 %r0.val, i64 addrspace(1)* %r0
+ ret void
+}
+
+; GCN-LABEL: {{^}}scalar_nand_i64_mul_use
+; GCN-NOT: s_nand_b64
+; GCN: s_and_b64
+; GCN: s_not_b64
+; GCN: s_add_u32
+; GCN: s_addc_u32
+define amdgpu_kernel void @scalar_nand_i64_mul_use(
+ i64 addrspace(1)* %r0, i64 addrspace(1)* %r1, i64 %a, i64 %b) {
+entry:
+ %and = and i64 %a, %b
+ %r0.val = xor i64 %and, -1
+ %r1.val = add i64 %and, %a
+ store i64 %r0.val, i64 addrspace(1)* %r0
+ store i64 %r1.val, i64 addrspace(1)* %r1
+ ret void
+}
+
+; GCN-LABEL: {{^}}vector_nand_i32_one_use
+; GCN-NOT: s_nand_b32
+; GCN: v_and_b32
+; GCN: v_not_b32
+define i32 @vector_nand_i32_one_use(i32 %a, i32 %b) {
+entry:
+ %and = and i32 %a, %b
+ %r = xor i32 %and, -1
+ ret i32 %r
+}
+
+; GCN-LABEL: {{^}}vector_nand_i64_one_use
+; GCN-NOT: s_nand_b64
+; GCN: v_and_b32
+; GCN: v_and_b32
+; GCN: v_not_b32
+; GCN: v_not_b32
+define i64 @vector_nand_i64_one_use(i64 %a, i64 %b) {
+entry:
+ %and = and i64 %a, %b
+ %r = xor i64 %and, -1
+ ret i64 %r
+}
diff --git a/llvm/test/CodeGen/AMDGPU/nor.ll b/llvm/test/CodeGen/AMDGPU/nor.ll
new file mode 100644
index 00000000000..8fddd39cad3
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/nor.ll
@@ -0,0 +1,83 @@
+; RUN: llc -march=amdgcn -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX600 %s
+; RUN: llc -march=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX700 %s
+; RUN: llc -march=amdgcn -mcpu=gfx801 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX801 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX900 %s
+
+; GCN-LABEL: {{^}}scalar_nor_i32_one_use
+; GCN: s_nor_b32
+define amdgpu_kernel void @scalar_nor_i32_one_use(
+ i32 addrspace(1)* %r0, i32 %a, i32 %b) {
+entry:
+ %or = or i32 %a, %b
+ %r0.val = xor i32 %or, -1
+ store i32 %r0.val, i32 addrspace(1)* %r0
+ ret void
+}
+
+; GCN-LABEL: {{^}}scalar_nor_i32_mul_use
+; GCN-NOT: s_nor_b32
+; GCN: s_or_b32
+; GCN: s_not_b32
+; GCN: s_add_i32
+define amdgpu_kernel void @scalar_nor_i32_mul_use(
+ i32 addrspace(1)* %r0, i32 addrspace(1)* %r1, i32 %a, i32 %b) {
+entry:
+ %or = or i32 %a, %b
+ %r0.val = xor i32 %or, -1
+ %r1.val = add i32 %or, %a
+ store i32 %r0.val, i32 addrspace(1)* %r0
+ store i32 %r1.val, i32 addrspace(1)* %r1
+ ret void
+}
+
+; GCN-LABEL: {{^}}scalar_nor_i64_one_use
+; GCN: s_nor_b64
+define amdgpu_kernel void @scalar_nor_i64_one_use(
+ i64 addrspace(1)* %r0, i64 %a, i64 %b) {
+entry:
+ %or = or i64 %a, %b
+ %r0.val = xor i64 %or, -1
+ store i64 %r0.val, i64 addrspace(1)* %r0
+ ret void
+}
+
+; GCN-LABEL: {{^}}scalar_nor_i64_mul_use
+; GCN-NOT: s_nor_b64
+; GCN: s_or_b64
+; GCN: s_not_b64
+; GCN: s_add_u32
+; GCN: s_addc_u32
+define amdgpu_kernel void @scalar_nor_i64_mul_use(
+ i64 addrspace(1)* %r0, i64 addrspace(1)* %r1, i64 %a, i64 %b) {
+entry:
+ %or = or i64 %a, %b
+ %r0.val = xor i64 %or, -1
+ %r1.val = add i64 %or, %a
+ store i64 %r0.val, i64 addrspace(1)* %r0
+ store i64 %r1.val, i64 addrspace(1)* %r1
+ ret void
+}
+
+; GCN-LABEL: {{^}}vector_nor_i32_one_use
+; GCN-NOT: s_nor_b32
+; GCN: v_or_b32
+; GCN: v_not_b32
+define i32 @vector_nor_i32_one_use(i32 %a, i32 %b) {
+entry:
+ %or = or i32 %a, %b
+ %r = xor i32 %or, -1
+ ret i32 %r
+}
+
+; GCN-LABEL: {{^}}vector_nor_i64_one_use
+; GCN-NOT: s_nor_b64
+; GCN: v_or_b32
+; GCN: v_or_b32
+; GCN: v_not_b32
+; GCN: v_not_b32
+define i64 @vector_nor_i64_one_use(i64 %a, i64 %b) {
+entry:
+ %or = or i64 %a, %b
+ %r = xor i64 %or, -1
+ ret i64 %r
+}
diff --git a/llvm/test/CodeGen/AMDGPU/xnor.ll b/llvm/test/CodeGen/AMDGPU/xnor.ll
index 0371cc68f04..103cb3487ca 100644
--- a/llvm/test/CodeGen/AMDGPU/xnor.ll
+++ b/llvm/test/CodeGen/AMDGPU/xnor.ll
@@ -61,8 +61,8 @@ entry:
; GCN-LABEL: {{^}}vector_xnor_i32_one_use
; GCN-NOT: s_xnor_b32
-; GCN: v_xor_b32
; GCN: v_not_b32
+; GCN: v_xor_b32
; GCN-DL: v_xnor_b32
define i32 @vector_xnor_i32_one_use(i32 %a, i32 %b) {
entry:
@@ -73,10 +73,10 @@ entry:
; GCN-LABEL: {{^}}vector_xnor_i64_one_use
; GCN-NOT: s_xnor_b64
-; GCN: v_xor_b32
-; GCN: v_xor_b32
; GCN: v_not_b32
+; GCN: v_xor_b32
; GCN: v_not_b32
+; GCN: v_xor_b32
; GCN-DL: v_xnor_b32
; GCN-DL: v_xnor_b32
define i64 @vector_xnor_i64_one_use(i64 %a, i64 %b) {
@@ -85,3 +85,30 @@ entry:
%r = xor i64 %xor, -1
ret i64 %r
}
+
+; GCN-LABEL: {{^}}xnor_s_v_i32_one_use
+; GCN-NOT: s_xnor_b32
+; GCN: s_not_b32
+; GCN: v_xor_b32
+define amdgpu_kernel void @xnor_s_v_i32_one_use(i32 addrspace(1)* %out, i32 %s) {
+ %v = call i32 @llvm.amdgcn.workitem.id.x() #1
+ %xor = xor i32 %s, %v
+ %d = xor i32 %xor, -1
+ store i32 %d, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}xnor_v_s_i32_one_use
+; GCN-NOT: s_xnor_b32
+; GCN: s_not_b32
+; GCN: v_xor_b32
+define amdgpu_kernel void @xnor_v_s_i32_one_use(i32 addrspace(1)* %out, i32 %s) {
+ %v = call i32 @llvm.amdgcn.workitem.id.x() #1
+ %xor = xor i32 %v, %s
+ %d = xor i32 %xor, -1
+ store i32 %d, i32 addrspace(1)* %out
+ ret void
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.amdgcn.workitem.id.x() #0