| author | Graham Sellers <graham.sellers@amd.com> | 2018-12-07 15:33:21 +0000 |
|---|---|---|
| committer | Graham Sellers <graham.sellers@amd.com> | 2018-12-07 15:33:21 +0000 |
| commit | b297379ef07829ac7f06c0e2058a889366c46a82 | |
| tree | f1f6b124bb90dfa8f870194377314a7ae57d0c40 /llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp | |
| parent | 248ed074199e4aa65c706a11033de88da883856d | |
[AMDGPU] Shrink scalar AND, OR, XOR instructions
This change attempts to shrink scalar AND, OR, and XOR instructions whose immediate operand cannot be encoded as an inline constant.
It performs:
AND s0, s0, ~(1 << n) -> BITSET0 s0, n
OR s0, s0, (1 << n) -> BITSET1 s0, n
AND s0, s1, x -> ANDN2 s0, s1, ~x
OR s0, s1, x -> ORN2 s0, s1, ~x
XOR s0, s1, x -> XNOR s0, s1, ~x
In particular, this catches clearing and setting the sign bit, as done for fabs-style operations (and x, 0x7fffffff -> bitset0 x, 31; or x, 0x80000000 -> bitset1 x, 31).
llvm-svn: 348601
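For reference, the rewrites above are pure bit identities. The following minimal C++ sketch (illustrative only, not part of the commit) checks them with plain integer arithmetic; bitset0/bitset1 here are hypothetical helpers modeling the single-bit clear/set semantics of S_BITSET0_B32 / S_BITSET1_B32:

```cpp
#include <cassert>
#include <cstdint>

// Model the single-bit clear/set semantics of S_BITSET0_B32 / S_BITSET1_B32.
static uint32_t bitset0(uint32_t v, unsigned n) { return v & ~(1u << n); }
static uint32_t bitset1(uint32_t v, unsigned n) { return v | (1u << n); }

int main() {
  const uint32_t a = 0xDEADBEEF, b = 0x12345678;

  // AND s0, s0, ~(1 << n) -> BITSET0 s0, n (e.g. the fabs mask 0x7fffffff).
  assert((a & 0x7fffffffu) == bitset0(a, 31));

  // OR s0, s0, (1 << n) -> BITSET1 s0, n (e.g. the sign-bit mask 0x80000000).
  assert((a | 0x80000000u) == bitset1(a, 31));

  // ANDN2/ORN2 consume the inverted immediate: x OP y == x OP ~(~y).
  assert((a & b) == (a & ~(~b)));
  assert((a | b) == (a | ~(~b)));

  // XNOR with the inverted immediate: a ^ b == ~(a ^ ~b).
  assert((a ^ b) == ~(a ^ ~b));
  return 0;
}
```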
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp')
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp | 84 |
|---|---|---|

1 file changed, 84 insertions(+), 0 deletions(-)
```diff
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 015773b1104..6ad7dd0e3a7 100644
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -212,6 +212,82 @@ static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) {
   }
 }
 
+/// Attempt to shrink AND/OR/XOR operations requiring non-inlineable literals.
+/// For AND or OR, try using S_BITSET{0,1} to clear or set bits.
+/// If the inverse of the immediate is legal, use ANDN2, ORN2 or
+/// XNOR (as a ^ b == ~(a ^ ~b)).
+/// \returns true if the caller should continue the machine function iterator
+static bool shrinkScalarLogicOp(const GCNSubtarget &ST,
+                                MachineRegisterInfo &MRI,
+                                const SIInstrInfo *TII,
+                                MachineInstr &MI) {
+  unsigned Opc = MI.getOpcode();
+  const MachineOperand *Dest = &MI.getOperand(0);
+  MachineOperand *Src0 = &MI.getOperand(1);
+  MachineOperand *Src1 = &MI.getOperand(2);
+  MachineOperand *SrcReg = Src0;
+  MachineOperand *SrcImm = Src1;
+
+  if (SrcImm->isImm() &&
+      !AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST.hasInv2PiInlineImm())) {
+    uint32_t Imm = static_cast<uint32_t>(SrcImm->getImm());
+    uint32_t NewImm = 0;
+
+    if (Opc == AMDGPU::S_AND_B32) {
+      if (isPowerOf2_32(~Imm)) {
+        NewImm = countTrailingOnes(Imm);
+        Opc = AMDGPU::S_BITSET0_B32;
+      } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
+        NewImm = ~Imm;
+        Opc = AMDGPU::S_ANDN2_B32;
+      }
+    } else if (Opc == AMDGPU::S_OR_B32) {
+      if (isPowerOf2_32(Imm)) {
+        NewImm = countTrailingZeros(Imm);
+        Opc = AMDGPU::S_BITSET1_B32;
+      } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
+        NewImm = ~Imm;
+        Opc = AMDGPU::S_ORN2_B32;
+      }
+    } else if (Opc == AMDGPU::S_XOR_B32) {
+      if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
+        NewImm = ~Imm;
+        Opc = AMDGPU::S_XNOR_B32;
+      }
+    } else {
+      llvm_unreachable("unexpected opcode");
+    }
+
+    if ((Opc == AMDGPU::S_ANDN2_B32 || Opc == AMDGPU::S_ORN2_B32) &&
+        SrcImm == Src0) {
+      if (!TII->commuteInstruction(MI, false, 1, 2))
+        NewImm = 0;
+    }
+
+    if (NewImm != 0) {
+      if (TargetRegisterInfo::isVirtualRegister(Dest->getReg()) &&
+          SrcReg->isReg()) {
+        MRI.setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg());
+        MRI.setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg());
+        return true;
+      }
+
+      if (SrcReg->isReg() && SrcReg->getReg() == Dest->getReg()) {
+        MI.setDesc(TII->get(Opc));
+        if (Opc == AMDGPU::S_BITSET0_B32 ||
+            Opc == AMDGPU::S_BITSET1_B32) {
+          Src0->ChangeToImmediate(NewImm);
+          MI.RemoveOperand(2);
+        } else {
+          SrcImm->setImm(NewImm);
+        }
+      }
+    }
+  }
+
+  return false;
+}
+
 // This is the same as MachineInstr::readsRegister/modifiesRegister except
 // it takes subregs into account.
 static bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R,
@@ -512,6 +588,14 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
         continue;
       }
 
+      // Shrink scalar logic operations.
+      if (MI.getOpcode() == AMDGPU::S_AND_B32 ||
+          MI.getOpcode() == AMDGPU::S_OR_B32 ||
+          MI.getOpcode() == AMDGPU::S_XOR_B32) {
+        if (shrinkScalarLogicOp(ST, MRI, TII, MI))
+          continue;
+      }
+
       if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
         continue;
```
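To see how the opcode selection above behaves on concrete immediates, here is a standalone sketch that mirrors the decision logic for the S_AND_B32 case only. Note that isInlinable below is a simplified stand-in covering just the integer inline-constant range [-16, 64] (an assumption made for illustration); the real AMDGPU::isInlinableLiteral32 also accepts certain floating-point bit patterns:

```cpp
#include <cstdint>
#include <cstdio>

// Simplified stand-in for AMDGPU::isInlinableLiteral32 (assumption: only
// the integer inline-constant range [-16, 64] is modeled; the real helper
// also accepts selected float bit patterns).
static bool isInlinable(int32_t V) { return V >= -16 && V <= 64; }

static bool isPow2(uint32_t V) { return V && !(V & (V - 1)); }

// Mirrors the S_AND_B32 branch of shrinkScalarLogicOp: prefer BITSET0 when
// the mask clears a single bit, otherwise try ANDN2 with the inverted
// immediate. Returns a description of the chosen rewrite.
static const char *pickForAnd(uint32_t Imm) {
  if (isPow2(~Imm))
    return "s_bitset0_b32 dst, n";          // Imm == ~(1 << n)
  if (isInlinable((int32_t)~Imm))
    return "s_andn2_b32 dst, src, ~Imm";    // ~Imm fits an inline constant
  return "no shrink";
}

int main() {
  printf("0x7fffffff -> %s\n", pickForAnd(0x7fffffffu)); // fabs mask: bitset0
  printf("0xfffffffc -> %s\n", pickForAnd(0xfffffffcu)); // ~Imm == 3: andn2
  printf("0x12345678 -> %s\n", pickForAnd(0x12345678u)); // neither applies
  return 0;
}
```

The S_OR_B32 and S_XOR_B32 branches follow the same pattern: OR tests Imm itself for a single set bit (BITSET1) before falling back to ORN2, and XOR only checks whether ~Imm is inlineable (XNOR).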