diff options
author | Chandler Carruth <chandlerc@gmail.com> | 2018-07-24 00:21:59 +0000 |
---|---|---|
committer | Chandler Carruth <chandlerc@gmail.com> | 2018-07-24 00:21:59 +0000 |
commit | b46c22de006a2c48774c4d449b18fda2e07ad948 (patch) | |
tree | 4beb51222bda2130bfa70e995232ce3bb42d2192 /llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp | |
parent | 7adcf292a1b151d25accced68d3a12b8e2e0c68c (diff) | |
download | bcm5719-llvm-b46c22de006a2c48774c4d449b18fda2e07ad948.tar.gz bcm5719-llvm-b46c22de006a2c48774c4d449b18fda2e07ad948.zip |
[x86/SLH] Remove complex SHRX-based post-load hardening.
This code was really nasty, had several bugs in it originally, and
wasn't carrying its weight. While on Zen we have all 4 ports available
for SHRX, on all of the Intel parts covered by Agner Fog's tables, SHRX can only
execute on 2 ports, giving it 1/2 the throughput of OR.
Worse, all too often this pattern required two SHRX instructions in
a chain, hurting the critical path by a lot.
Even if we end up needing to save/restore EFLAGS, that is no longer so
bad. We pay for a uop to save the flag, but we very likely get fusion
when it is used by forming a test/jCC pair or something similar. In
practice, I don't expect the SHRX to be a significant savings here, so
I'd like to avoid the complex code required. We can always resurrect
this if/when someone has a specific performance issue addressed by it.
llvm-svn: 337781
Diffstat (limited to 'llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp')
-rw-r--r-- | llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp | 83 |
1 files changed, 10 insertions, 73 deletions
diff --git a/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp b/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp index 3d2bbd3e55c..93dcf95bebd 100644 --- a/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp +++ b/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp @@ -1907,81 +1907,18 @@ void X86SpeculativeLoadHardeningPass::hardenPostLoad(MachineInstr &MI) { auto InsertPt = std::next(MI.getIterator()); unsigned FlagsReg = 0; - bool EFLAGSLive = isEFLAGSLive(MBB, InsertPt, *TRI); - if (EFLAGSLive && !Subtarget->hasBMI2()) { + if (isEFLAGSLive(MBB, InsertPt, *TRI)) FlagsReg = saveEFLAGS(MBB, InsertPt, Loc); - EFLAGSLive = false; - } - if (!EFLAGSLive) { - unsigned StateReg = GetStateRegInRC(*DefRC); - unsigned NewDefReg = MRI->createVirtualRegister(DefRC); - DefOp.setReg(NewDefReg); - auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(OrOpCode), OldDefReg) - .addReg(StateReg) - .addReg(NewDefReg); - OrI->addRegisterDead(X86::EFLAGS, TRI); - ++NumInstsInserted; - LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n"); - } else { - assert(Subtarget->hasBMI2() && - "Cannot harden loads and preserve EFLAGS without BMI2!"); - - unsigned ShiftOpCode = DefRegBytes < 4 ? X86::SHRX32rr : X86::SHRX64rr; - auto &ShiftRC = - DefRegBytes < 4 ? X86::GR32_NOSPRegClass : X86::GR64_NOSPRegClass; - int ShiftRegBytes = TRI->getRegSizeInBits(ShiftRC) / 8; - unsigned DefSubRegImm = SubRegImms[Log2_32(DefRegBytes)]; - - unsigned StateReg = GetStateRegInRC(ShiftRC); - - // First have the def instruction def a temporary register. - unsigned TmpReg = MRI->createVirtualRegister(DefRC); - DefOp.setReg(TmpReg); - // Now copy it into a register of the shift RC. 
- unsigned ShiftInputReg = TmpReg; - if (DefRegBytes != ShiftRegBytes) { - unsigned UndefReg = MRI->createVirtualRegister(&ShiftRC); - BuildMI(MBB, InsertPt, Loc, TII->get(X86::IMPLICIT_DEF), UndefReg); - ShiftInputReg = MRI->createVirtualRegister(&ShiftRC); - BuildMI(MBB, InsertPt, Loc, TII->get(X86::INSERT_SUBREG), ShiftInputReg) - .addReg(UndefReg) - .addReg(TmpReg) - .addImm(DefSubRegImm); - } - - // We shift this once if the shift is wider than the def and thus we can - // shift *all* of the def'ed bytes out. Otherwise we need to do two shifts. - - unsigned ShiftedReg = MRI->createVirtualRegister(&ShiftRC); - auto Shift1I = - BuildMI(MBB, InsertPt, Loc, TII->get(ShiftOpCode), ShiftedReg) - .addReg(ShiftInputReg) - .addReg(StateReg); - (void)Shift1I; - ++NumInstsInserted; - LLVM_DEBUG(dbgs() << " Inserting shrx: "; Shift1I->dump(); dbgs() << "\n"); - - // The only way we have a bit left is if all 8 bytes were defined. Do an - // extra shift to get the last bit in this case. - if (DefRegBytes == ShiftRegBytes) { - // We can just directly def the old def register as its the same size. - ShiftInputReg = ShiftedReg; - auto Shift2I = - BuildMI(MBB, InsertPt, Loc, TII->get(ShiftOpCode), OldDefReg) - .addReg(ShiftInputReg) - .addReg(StateReg); - (void)Shift2I; - ++NumInstsInserted; - LLVM_DEBUG(dbgs() << " Inserting shrx: "; Shift2I->dump(); - dbgs() << "\n"); - } else { - // When we have different size shift register we need to fix up the - // class. We can do that as we copy into the old def register. 
- BuildMI(MBB, InsertPt, Loc, TII->get(TargetOpcode::COPY), OldDefReg) - .addReg(ShiftedReg, 0, DefSubRegImm); - } - } + unsigned StateReg = GetStateRegInRC(*DefRC); + unsigned NewDefReg = MRI->createVirtualRegister(DefRC); + DefOp.setReg(NewDefReg); + auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(OrOpCode), OldDefReg) + .addReg(StateReg) + .addReg(NewDefReg); + OrI->addRegisterDead(X86::EFLAGS, TRI); + ++NumInstsInserted; + LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n"); if (FlagsReg) restoreEFLAGS(MBB, InsertPt, Loc, FlagsReg); |