8 files changed, 755 insertions, 117 deletions
diff --git a/llvm/lib/CodeGen/MachineBasicBlock.cpp b/llvm/lib/CodeGen/MachineBasicBlock.cpp
index f8fad028aa9..168a4513b96 100644
--- a/llvm/lib/CodeGen/MachineBasicBlock.cpp
+++ b/llvm/lib/CodeGen/MachineBasicBlock.cpp
@@ -720,6 +720,14 @@ void MachineBasicBlock::replaceSuccessor(MachineBasicBlock *Old,
   removeSuccessor(OldI);
 }
 
+void MachineBasicBlock::copySuccessor(MachineBasicBlock *Orig,
+                                      succ_iterator I) {
+  if (Orig->Probs.empty())
+    addSuccessor(*I, Orig->getSuccProbability(I));
+  else
+    addSuccessorWithoutProb(*I);
+}
+
 void MachineBasicBlock::addPredecessor(MachineBasicBlock *Pred) {
   Predecessors.push_back(Pred);
 }
diff --git a/llvm/lib/Target/X86/CMakeLists.txt b/llvm/lib/Target/X86/CMakeLists.txt
index a4be362452f..d4bfe510547 100644
--- a/llvm/lib/Target/X86/CMakeLists.txt
+++ b/llvm/lib/Target/X86/CMakeLists.txt
@@ -34,6 +34,7 @@ set(sources
   X86FixupLEAs.cpp
   X86AvoidStoreForwardingBlocks.cpp
   X86FixupSetCC.cpp
+  X86FlagsCopyLowering.cpp
   X86FloatingPoint.cpp
   X86FrameLowering.cpp
   X86InstructionSelector.cpp
diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h
index 90eac737eaa..4a46e36e47b 100644
--- a/llvm/lib/Target/X86/X86.h
+++ b/llvm/lib/Target/X86/X86.h
@@ -78,6 +78,9 @@ FunctionPass *createX86FixupSetCC();
 /// Return a pass that avoids creating store forward block issues in the hardware.
 FunctionPass *createX86AvoidStoreForwardingBlocks();
 
+/// Return a pass that lowers EFLAGS copy pseudo instructions.
+FunctionPass *createX86FlagsCopyLoweringPass();
+
 /// Return a pass that expands WinAlloca pseudo-instructions.
 FunctionPass *createX86WinAllocaExpander();
 
diff --git a/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp b/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
new file mode 100644
index 00000000000..1f4bd7fd501
--- /dev/null
+++ b/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
@@ -0,0 +1,734 @@
+//====- X86FlagsCopyLowering.cpp - Lowers COPY nodes of EFLAGS ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// Lowers COPY nodes of EFLAGS by directly extracting and preserving individual
+/// flag bits.
+///
+/// We have to do this by carefully analyzing and rewriting the usage of the
+/// copied EFLAGS register because there is no general way to rematerialize the
+/// entire EFLAGS register safely and efficiently. Using `popf` both forces
+/// dynamic stack adjustment and can create correctness issues due to IF, TF,
+/// and other non-status flags being overwritten. Using sequences involving
+/// SAHF don't work on all x86 processors and are often quite slow compared to
+/// directly testing a single status preserved in its own GPR.
+///
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/ScopeExit.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/SparseBitVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineSSAUpdater.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/MC/MCSchedule.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <utility>
+
+using namespace llvm;
+
+#define PASS_KEY "x86-flags-copy-lowering"
+#define DEBUG_TYPE PASS_KEY
+
+STATISTIC(NumCopiesEliminated, "Number of copies of EFLAGS eliminated");
+STATISTIC(NumSetCCsInserted, "Number of setCC instructions inserted");
+STATISTIC(NumTestsInserted, "Number of test instructions inserted");
+STATISTIC(NumAddsInserted, "Number of adds instructions inserted");
+
+namespace llvm {
+
+void initializeX86FlagsCopyLoweringPassPass(PassRegistry &);
+
+} // end namespace llvm
+
+namespace {
+
+// Convenient array type for storing registers associated with each condition.
+using CondRegArray = std::array<unsigned, X86::LAST_VALID_COND + 1>;
+
+class X86FlagsCopyLoweringPass : public MachineFunctionPass {
+public:
+  X86FlagsCopyLoweringPass() : MachineFunctionPass(ID) {
+    initializeX86FlagsCopyLoweringPassPass(*PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override { return "X86 EFLAGS copy lowering"; }
+  bool runOnMachineFunction(MachineFunction &MF) override;
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+  /// Pass identification, replacement for typeid.
+  static char ID;
+
+private:
+  MachineRegisterInfo *MRI;
+  const X86InstrInfo *TII;
+  const TargetRegisterInfo *TRI;
+  const TargetRegisterClass *PromoteRC;
+
+  CondRegArray collectCondsInRegs(MachineBasicBlock &MBB,
+                                  MachineInstr &CopyDefI);
+
+  unsigned promoteCondToReg(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator TestPos,
+                            DebugLoc TestLoc, X86::CondCode Cond);
+  std::pair<unsigned, bool>
+  getCondOrInverseInReg(MachineBasicBlock &TestMBB,
+                        MachineBasicBlock::iterator TestPos, DebugLoc TestLoc,
+                        X86::CondCode Cond, CondRegArray &CondRegs);
+  void insertTest(MachineBasicBlock &MBB, MachineBasicBlock::iterator Pos,
+                  DebugLoc Loc, unsigned Reg);
+
+  void rewriteArithmetic(MachineBasicBlock &TestMBB,
+                         MachineBasicBlock::iterator TestPos, DebugLoc TestLoc,
+                         MachineInstr &MI, MachineOperand &FlagUse,
+                         CondRegArray &CondRegs);
+  void rewriteCMov(MachineBasicBlock &TestMBB,
+                   MachineBasicBlock::iterator TestPos, DebugLoc TestLoc,
+                   MachineInstr &CMovI, MachineOperand &FlagUse,
+                   CondRegArray &CondRegs);
+  void rewriteCondJmp(MachineBasicBlock &TestMBB,
+                      MachineBasicBlock::iterator TestPos, DebugLoc TestLoc,
+                      MachineInstr &JmpI, CondRegArray &CondRegs);
+  void rewriteCopy(MachineInstr &MI, MachineOperand &FlagUse,
+                   MachineInstr &CopyDefI);
+  void rewriteSetCC(MachineBasicBlock &TestMBB,
+                    MachineBasicBlock::iterator TestPos, DebugLoc TestLoc,
+                    MachineInstr &SetCCI, MachineOperand &FlagUse,
+                    CondRegArray &CondRegs);
+};
+
+} // end anonymous namespace
+
+INITIALIZE_PASS_BEGIN(X86FlagsCopyLoweringPass, DEBUG_TYPE,
+                      "X86 EFLAGS copy lowering", false, false)
+INITIALIZE_PASS_END(X86FlagsCopyLoweringPass, DEBUG_TYPE,
+                    "X86 EFLAGS copy lowering", false, false)
+
+FunctionPass *llvm::createX86FlagsCopyLoweringPass() {
+  return new X86FlagsCopyLoweringPass();
+}
+
+char X86FlagsCopyLoweringPass::ID = 0;
+
+void X86FlagsCopyLoweringPass::getAnalysisUsage(AnalysisUsage &AU) const {
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+namespace {
+/// An enumeration of the arithmetic instruction mnemonics which have
+/// interesting flag semantics.
+///
+/// We can map instruction opcodes into these mnemonics to make it easy to
+/// dispatch with specific functionality.
+enum class FlagArithMnemonic {
+  ADC,
+  ADCX,
+  ADOX,
+  RCL,
+  RCR,
+  SBB,
+};
+} // namespace
+
+static FlagArithMnemonic getMnemonicFromOpcode(unsigned Opcode) {
+  switch (Opcode) {
+  default:
+    report_fatal_error("No support for lowering a copy into EFLAGS when used "
+                       "by this instruction!");
+
+#define LLVM_EXPAND_INSTR_SIZES(MNEMONIC, SUFFIX)                              \
+  case X86::MNEMONIC##8##SUFFIX:                                               \
+  case X86::MNEMONIC##16##SUFFIX:                                              \
+  case X86::MNEMONIC##32##SUFFIX:                                              \
+  case X86::MNEMONIC##64##SUFFIX:
+
+#define LLVM_EXPAND_ADC_SBB_INSTR(MNEMONIC)                                    \
+  LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rr)                                        \
+  LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rr_REV)                                    \
+  LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rm)                                        \
+  LLVM_EXPAND_INSTR_SIZES(MNEMONIC, mr)                                        \
+  case X86::MNEMONIC##8ri:                                                     \
+  case X86::MNEMONIC##16ri8:                                                   \
+  case X86::MNEMONIC##32ri8:                                                   \
+  case X86::MNEMONIC##64ri8:                                                   \
+  case X86::MNEMONIC##16ri:                                                    \
+  case X86::MNEMONIC##32ri:                                                    \
+  case X86::MNEMONIC##64ri32:                                                  \
+  case X86::MNEMONIC##8mi:                                                     \
+  case X86::MNEMONIC##16mi8:                                                   \
+  case X86::MNEMONIC##32mi8:                                                   \
+  case X86::MNEMONIC##64mi8:                                                   \
+  case X86::MNEMONIC##16mi:                                                    \
+  case X86::MNEMONIC##32mi:                                                    \
+  case X86::MNEMONIC##64mi32:                                                  \
+  case X86::MNEMONIC##8i8:                                                     \
+  case X86::MNEMONIC##16i16:                                                   \
+  case X86::MNEMONIC##32i32:                                                   \
+  case X86::MNEMONIC##64i32:
+
+    LLVM_EXPAND_ADC_SBB_INSTR(ADC)
+    return FlagArithMnemonic::ADC;
+
+    LLVM_EXPAND_ADC_SBB_INSTR(SBB)
+    return FlagArithMnemonic::SBB;
+
+#undef LLVM_EXPAND_ADC_SBB_INSTR
+
+    LLVM_EXPAND_INSTR_SIZES(RCL, rCL)
+    LLVM_EXPAND_INSTR_SIZES(RCL, r1)
+    LLVM_EXPAND_INSTR_SIZES(RCL, ri)
+    return FlagArithMnemonic::RCL;
+
+    LLVM_EXPAND_INSTR_SIZES(RCR, rCL)
+    LLVM_EXPAND_INSTR_SIZES(RCR, r1)
+    LLVM_EXPAND_INSTR_SIZES(RCR, ri)
+    return FlagArithMnemonic::RCR;
+
+#undef LLVM_EXPAND_INSTR_SIZES
+
+  case X86::ADCX32rr:
+  case X86::ADCX64rr:
+  case X86::ADCX32rm:
+  case X86::ADCX64rm:
+    return FlagArithMnemonic::ADCX;
+
+  case X86::ADOX32rr:
+  case X86::ADOX64rr:
+  case X86::ADOX32rm:
+  case X86::ADOX64rm:
+    return FlagArithMnemonic::ADOX;
+  }
+}
+
+static MachineBasicBlock &splitBlock(MachineBasicBlock &MBB,
+                                     MachineInstr &SplitI,
+                                     const X86InstrInfo &TII) {
+  MachineFunction &MF = *MBB.getParent();
+
+  assert(SplitI.getParent() == &MBB &&
+         "Split instruction must be in the split block!");
+  assert(SplitI.isBranch() &&
+         "Only designed to split a tail of branch instructions!");
+  assert(X86::getCondFromBranchOpc(SplitI.getOpcode()) != X86::COND_INVALID &&
+         "Must split on an actual jCC instruction!");
+
+  // Dig out the previous instruction to the split point.
+  MachineInstr &PrevI = *std::prev(SplitI.getIterator());
+  assert(PrevI.isBranch() && "Must split after a branch!");
+  assert(X86::getCondFromBranchOpc(PrevI.getOpcode()) != X86::COND_INVALID &&
+         "Must split after an actual jCC instruction!");
+  assert(!std::prev(PrevI.getIterator())->isTerminator() &&
+         "Must only have this one terminator prior to the split!");
+
+  // Grab the one successor edge that will stay in `MBB`.
+  MachineBasicBlock &UnsplitSucc = *PrevI.getOperand(0).getMBB();
+
+  // Analyze the original block to see if we are actually splitting an edge
+  // into two edges. This can happen when we have multiple conditional jumps to
+  // the same successor.
+  bool IsEdgeSplit =
+      std::any_of(SplitI.getIterator(), MBB.instr_end(),
+                  [&](MachineInstr &MI) {
+                    assert(MI.isTerminator() &&
+                           "Should only have spliced terminators!");
+                    return llvm::any_of(
+                        MI.operands(), [&](MachineOperand &MOp) {
+                          return MOp.isMBB() && MOp.getMBB() == &UnsplitSucc;
+                        });
+                  }) ||
+      MBB.getFallThrough() == &UnsplitSucc;
+
+  MachineBasicBlock &NewMBB = *MF.CreateMachineBasicBlock();
+
+  // Insert the new block immediately after the current one. Any existing
+  // fallthrough will be sunk into this new block anyways.
+  MF.insert(std::next(MachineFunction::iterator(&MBB)), &NewMBB);
+
+  // Splice the tail of instructions into the new block.
+  NewMBB.splice(NewMBB.end(), &MBB, SplitI.getIterator(), MBB.end());
+
+  // Copy the necessary succesors (and their probability info) into the new
+  // block.
+  for (auto SI = MBB.succ_begin(), SE = MBB.succ_end(); SI != SE; ++SI)
+    if (IsEdgeSplit || *SI != &UnsplitSucc)
+      NewMBB.copySuccessor(&MBB, SI);
+  // Normalize the probabilities if we didn't end up splitting the edge.
+  if (!IsEdgeSplit)
+    NewMBB.normalizeSuccProbs();
+
+  // Now replace all of the moved successors in the original block with the new
+  // block. This will merge their probabilities.
+  for (MachineBasicBlock *Succ : NewMBB.successors())
+    if (Succ != &UnsplitSucc)
+      MBB.replaceSuccessor(Succ, &NewMBB);
+
+  // We should always end up replacing at least one successor.
+  assert(MBB.isSuccessor(&NewMBB) &&
+         "Failed to make the new block a successor!");
+
+  // Now update all the PHIs.
+  for (MachineBasicBlock *Succ : NewMBB.successors()) {
+    for (MachineInstr &MI : *Succ) {
+      if (!MI.isPHI())
+        break;
+
+      for (int OpIdx = 1, NumOps = MI.getNumOperands(); OpIdx < NumOps;
+           OpIdx += 2) {
+        MachineOperand &OpV = MI.getOperand(OpIdx);
+        MachineOperand &OpMBB = MI.getOperand(OpIdx + 1);
+        assert(OpMBB.isMBB() && "Block operand to a PHI is not a block!");
+        if (OpMBB.getMBB() != &MBB)
+          continue;
+
+        // Replace the operand for unsplit successors
+        if (!IsEdgeSplit || Succ != &UnsplitSucc) {
+          OpMBB.setMBB(&NewMBB);
+
+          // We have to continue scanning as there may be multiple entries in
+          // the PHI.
+          continue;
+        }
+
+        // When we have split the edge append a new successor.
+        MI.addOperand(MF, OpV);
+        MI.addOperand(MF, MachineOperand::CreateMBB(&NewMBB));
+        break;
+      }
+    }
+  }
+
+  return NewMBB;
+}
+
+bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
+  DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName()
+               << " **********\n");
+
+  auto &Subtarget = MF.getSubtarget<X86Subtarget>();
+  MRI = &MF.getRegInfo();
+  TII = Subtarget.getInstrInfo();
+  TRI = Subtarget.getRegisterInfo();
+  PromoteRC = &X86::GR8RegClass;
+
+  if (MF.begin() == MF.end())
+    // Nothing to do for a degenerate empty function...
+    return false;
+
+  SmallVector<MachineInstr *, 4> Copies;
+  for (MachineBasicBlock &MBB : MF)
+    for (MachineInstr &MI : MBB)
+      if (MI.getOpcode() == TargetOpcode::COPY &&
+          MI.getOperand(0).getReg() == X86::EFLAGS)
+        Copies.push_back(&MI);
+
+  for (MachineInstr *CopyI : Copies) {
+    MachineBasicBlock &MBB = *CopyI->getParent();
+
+    MachineOperand &VOp = CopyI->getOperand(1);
+    assert(VOp.isReg() &&
+           "The input to the copy for EFLAGS should always be a register!");
+    MachineInstr &CopyDefI = *MRI->getVRegDef(VOp.getReg());
+    if (CopyDefI.getOpcode() != TargetOpcode::COPY) {
+      // FIXME: The big likely candidate here are PHI nodes. We could in theory
+      // handle PHI nodes, but it gets really, really hard. Insanely hard. Hard
+      // enough that it is probably better to change every other part of LLVM
+      // to avoid creating them. The issue is that once we have PHIs we won't
+      // know which original EFLAGS value we need to capture with our setCCs
+      // below. The end result will be computing a complete set of setCCs that
+      // we *might* want, computing them in every place where we copy *out* of
+      // EFLAGS and then doing SSA formation on all of them to insert necessary
+      // PHI nodes and consume those here. Then hoping that somehow we DCE the
+      // unnecessary ones. This DCE seems very unlikely to be successful and so
+      // we will almost certainly end up with a glut of dead setCC
+      // instructions. Until we have a motivating test case and fail to avoid
+      // it by changing other parts of LLVM's lowering, we refuse to handle
+      // this complex case here.
+      DEBUG(dbgs() << "ERROR: Encountered unexpected def of an eflags copy: ";
+            CopyDefI.dump());
+      report_fatal_error(
+          "Cannot lower EFLAGS copy unless it is defined in turn by a copy!");
+    }
+
+    auto Cleanup = make_scope_exit([&] {
+      // All uses of the EFLAGS copy are now rewritten, kill the copy into
+      // eflags and if dead the copy from.
+      CopyI->eraseFromParent();
+      if (MRI->use_empty(CopyDefI.getOperand(0).getReg()))
+        CopyDefI.eraseFromParent();
+      ++NumCopiesEliminated;
+    });
+
+    MachineOperand &DOp = CopyI->getOperand(0);
+    assert(DOp.isDef() && "Expected register def!");
+    assert(DOp.getReg() == X86::EFLAGS && "Unexpected copy def register!");
+    if (DOp.isDead())
+      continue;
+
+    MachineBasicBlock &TestMBB = *CopyDefI.getParent();
+    auto TestPos = CopyDefI.getIterator();
+    DebugLoc TestLoc = CopyDefI.getDebugLoc();
+
+    DEBUG(dbgs() << "Rewriting copy: "; CopyI->dump());
+
+    // Scan for usage of newly set EFLAGS so we can rewrite them. We just buffer
+    // jumps because their usage is very constrained.
+    bool FlagsKilled = false;
+    SmallVector<MachineInstr *, 4> JmpIs;
+
+    // Gather the condition flags that have already been preserved in
+    // registers. We do this from scratch each time as we expect there to be
+    // very few of them and we expect to not revisit the same copy definition
+    // many times. If either of those change sufficiently we could build a map
+    // of these up front instead.
+    CondRegArray CondRegs = collectCondsInRegs(TestMBB, CopyDefI);
+
+    for (auto MII = std::next(CopyI->getIterator()), MIE = MBB.instr_end();
+         MII != MIE;) {
+      MachineInstr &MI = *MII++;
+      MachineOperand *FlagUse = MI.findRegisterUseOperand(X86::EFLAGS);
+      if (!FlagUse) {
+        if (MI.findRegisterDefOperand(X86::EFLAGS)) {
+          // If EFLAGS are defined, it's as-if they were killed. We can stop
+          // scanning here.
+          //
+          // NB!!! Many instructions only modify some flags. LLVM currently
+          // models this as clobbering all flags, but if that ever changes this
+          // will need to be carefully updated to handle that more complex
+          // logic.
+          FlagsKilled = true;
+          break;
+        }
+        continue;
+      }
+
+      DEBUG(dbgs() << "  Rewriting use: "; MI.dump());
+
+      // Check the kill flag before we rewrite as that may change it.
+      if (FlagUse->isKill())
+        FlagsKilled = true;
+
+      // Once we encounter a branch, the rest of the instructions must also be
+      // branches. We can't rewrite in place here, so we handle them below.
+      //
+      // Note that we don't have to handle tail calls here, even conditional
+      // tail calls, as those are not introduced into the X86 MI until post-RA
+      // branch folding or black placement. As a consequence, we get to deal
+      // with the simpler formulation of conditional branches followed by tail
+      // calls.
+      if (X86::getCondFromBranchOpc(MI.getOpcode()) != X86::COND_INVALID) {
+        auto JmpIt = MI.getIterator();
+        do {
+          JmpIs.push_back(&*JmpIt);
+          ++JmpIt;
+        } while (JmpIt != MBB.instr_end() &&
+                 X86::getCondFromBranchOpc(JmpIt->getOpcode()) !=
+                     X86::COND_INVALID);
+        break;
+      }
+
+      // Otherwise we can just rewrite in-place.
+      if (X86::getCondFromCMovOpc(MI.getOpcode()) != X86::COND_INVALID) {
+        rewriteCMov(TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
+      } else if (X86::getCondFromSETOpc(MI.getOpcode()) != X86::COND_INVALID) {
+        rewriteSetCC(TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
+      } else if (MI.getOpcode() == TargetOpcode::COPY) {
+        rewriteCopy(MI, *FlagUse, CopyDefI);
+      } else {
+        // We assume that arithmetic instructions that use flags also def them.
+        assert(MI.findRegisterDefOperand(X86::EFLAGS) &&
+               "Expected a def of EFLAGS for this instruction!");
+
+        // NB!!! Several arithmetic instructions only *partially* update
+        // flags. Theoretically, we could generate MI code sequences that
+        // would rely on this fact and observe different flags independently.
+        // But currently LLVM models all of these instructions as clobbering
+        // all the flags in an undef way. We rely on that to simplify the
+        // logic.
+        FlagsKilled = true;
+
+        rewriteArithmetic(TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
+        break;
+      }
+
+      // If this was the last use of the flags, we're done.
+      if (FlagsKilled)
+        break;
+    }
+
+    // If we didn't find a kill (or equivalent) check that the flags don't
+    // live-out of the basic block. Currently we don't support lowering copies
+    // of flags that live out in this fashion.
+    if (!FlagsKilled &&
+        llvm::any_of(MBB.successors(), [](MachineBasicBlock *SuccMBB) {
+          return SuccMBB->isLiveIn(X86::EFLAGS);
+        })) {
+      DEBUG({
+        dbgs() << "ERROR: Found a copied EFLAGS live-out from basic block:\n"
+               << "----\n";
+        MBB.dump();
+        dbgs() << "----\n"
+               << "ERROR: Cannot lower this EFLAGS copy!\n";
+      });
+      report_fatal_error(
+          "Cannot lower EFLAGS copy that lives out of a basic block!");
+    }
+
+    // Now rewrite the jumps that use the flags. These we handle specially
+    // because if there are multiple jumps we'll have to do surgery on the CFG.
+    for (MachineInstr *JmpI : JmpIs) {
+      // Past the first jump we need to split the blocks apart.
+      if (JmpI != JmpIs.front())
+        splitBlock(*JmpI->getParent(), *JmpI, *TII);
+
+      rewriteCondJmp(TestMBB, TestPos, TestLoc, *JmpI, CondRegs);
+    }
+
+    // FIXME: Mark the last use of EFLAGS before the copy's def as a kill if
+    // the copy's def operand is itself a kill.
+  }
+
+#ifndef NDEBUG
+  for (MachineBasicBlock &MBB : MF)
+    for (MachineInstr &MI : MBB)
+      if (MI.getOpcode() == TargetOpcode::COPY &&
+          (MI.getOperand(0).getReg() == X86::EFLAGS ||
+           MI.getOperand(1).getReg() == X86::EFLAGS)) {
+        DEBUG(dbgs() << "ERROR: Found a COPY involving EFLAGS: "; MI.dump());
+        llvm_unreachable("Unlowered EFLAGS copy!");
+      }
+#endif
+
+  return true;
+}
+
+/// Collect any conditions that have already been set in registers so that we
+/// can re-use them rather than adding duplicates.
+CondRegArray
+X86FlagsCopyLoweringPass::collectCondsInRegs(MachineBasicBlock &MBB,
+                                             MachineInstr &CopyDefI) {
+  CondRegArray CondRegs = {};
+
+  // Scan backwards across the range of instructions with live EFLAGS.
+  for (MachineInstr &MI : llvm::reverse(
+           llvm::make_range(MBB.instr_begin(), CopyDefI.getIterator()))) {
+    X86::CondCode Cond = X86::getCondFromSETOpc(MI.getOpcode());
+    if (Cond != X86::COND_INVALID && MI.getOperand(0).isReg() &&
+        TRI->isVirtualRegister(MI.getOperand(0).getReg()))
+      CondRegs[Cond] = MI.getOperand(0).getReg();
+
+    // Stop scanning when we see the first definition of the EFLAGS as prior to
+    // this we would potentially capture the wrong flag state.
+    if (MI.findRegisterDefOperand(X86::EFLAGS))
+      break;
+  }
+  return CondRegs;
+}
+
+unsigned X86FlagsCopyLoweringPass::promoteCondToReg(
+    MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
+    DebugLoc TestLoc, X86::CondCode Cond) {
+  unsigned Reg = MRI->createVirtualRegister(PromoteRC);
+  auto SetI = BuildMI(TestMBB, TestPos, TestLoc,
+                      TII->get(X86::getSETFromCond(Cond)), Reg);
+  (void)SetI;
+  DEBUG(dbgs() << "    save cond: "; SetI->dump());
+  ++NumSetCCsInserted;
+  return Reg;
+}
+
+std::pair<unsigned, bool> X86FlagsCopyLoweringPass::getCondOrInverseInReg(
+    MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
+    DebugLoc TestLoc, X86::CondCode Cond, CondRegArray &CondRegs) {
+  unsigned &CondReg = CondRegs[Cond];
+  unsigned &InvCondReg = CondRegs[X86::GetOppositeBranchCondition(Cond)];
+  if (!CondReg && !InvCondReg)
+    CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, Cond);
+
+  if (CondReg)
+    return {CondReg, false};
+  else
+    return {InvCondReg, true};
+}
+
+void X86FlagsCopyLoweringPass::insertTest(MachineBasicBlock &MBB,
+                                          MachineBasicBlock::iterator Pos,
+                                          DebugLoc Loc, unsigned Reg) {
+  // We emit test instructions as register/immediate test against -1. This
+  // allows register allocation to fold a memory operand if needed (that will
+  // happen often due to the places this code is emitted). But hopefully will
+  // also allow us to select a shorter encoding of `testb %reg, %reg` when that
+  // would be equivalent.
+  auto TestI =
+      BuildMI(MBB, Pos, Loc, TII->get(X86::TEST8ri)).addReg(Reg).addImm(-1);
+  (void)TestI;
+  DEBUG(dbgs() << "    test cond: "; TestI->dump());
+  ++NumTestsInserted;
+}
+
+void X86FlagsCopyLoweringPass::rewriteArithmetic(
+    MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
+    DebugLoc TestLoc, MachineInstr &MI, MachineOperand &FlagUse,
+    CondRegArray &CondRegs) {
+  // Arithmetic is either reading CF or OF. Figure out which condition we need
+  // to preserve in a register.
+  X86::CondCode Cond;
+
+  // The addend to use to reset CF or OF when added to the flag value.
+  int Addend;
+
+  switch (getMnemonicFromOpcode(MI.getOpcode())) {
+  case FlagArithMnemonic::ADC:
+  case FlagArithMnemonic::ADCX:
+  case FlagArithMnemonic::RCL:
+  case FlagArithMnemonic::RCR:
+  case FlagArithMnemonic::SBB:
+    Cond = X86::COND_B; // CF == 1
+    // Set up an addend that when one is added will need a carry due to not
+    // having a higher bit available.
+    Addend = 255;
+    break;
+
+  case FlagArithMnemonic::ADOX:
+    Cond = X86::COND_O; // OF == 1
+    // Set up an addend that when one is added will turn from positive to
+    // negative and thus overflow in the signed domain.
+    Addend = 127;
+    break;
+  }
+
+  // Now get a register that contains the value of the flag input to the
+  // arithmetic. We require exactly this flag to simplify the arithmetic
+  // required to materialize it back into the flag.
+  unsigned &CondReg = CondRegs[Cond];
+  if (!CondReg)
+    CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, Cond);
+
+  MachineBasicBlock &MBB = *MI.getParent();
+
+  // Insert an instruction that will set the flag back to the desired value.
+  unsigned TmpReg = MRI->createVirtualRegister(PromoteRC);
+  auto AddI =
+      BuildMI(MBB, MI.getIterator(), MI.getDebugLoc(), TII->get(X86::ADD8ri))
+          .addDef(TmpReg, RegState::Dead)
+          .addReg(CondReg)
+          .addImm(Addend);
+  (void)AddI;
+  DEBUG(dbgs() << "    add cond: "; AddI->dump());
+  ++NumAddsInserted;
+  FlagUse.setIsKill(true);
+}
+
+void X86FlagsCopyLoweringPass::rewriteCMov(MachineBasicBlock &TestMBB,
+                                           MachineBasicBlock::iterator TestPos,
+                                           DebugLoc TestLoc,
+                                           MachineInstr &CMovI,
+                                           MachineOperand &FlagUse,
+                                           CondRegArray &CondRegs) {
+  // First get the register containing this specific condition.
+  X86::CondCode Cond = X86::getCondFromCMovOpc(CMovI.getOpcode());
+  unsigned CondReg;
+  bool Inverted;
+  std::tie(CondReg, Inverted) =
+      getCondOrInverseInReg(TestMBB, TestPos, TestLoc, Cond, CondRegs);
+
+  MachineBasicBlock &MBB = *CMovI.getParent();
+
+  // Insert a direct test of the saved register.
+  insertTest(MBB, CMovI.getIterator(), CMovI.getDebugLoc(), CondReg);
+
+  // Rewrite the CMov to use the !ZF flag from the test (but match register
+  // size and memory operand), and then kill its use of the flags afterward.
+  auto &CMovRC = *MRI->getRegClass(CMovI.getOperand(0).getReg());
+  CMovI.setDesc(TII->get(X86::getCMovFromCond(
+      Inverted ? X86::COND_E : X86::COND_NE, TRI->getRegSizeInBits(CMovRC) / 8,
+      !CMovI.memoperands_empty())));
+  FlagUse.setIsKill(true);
+  DEBUG(dbgs() << "    fixed cmov: "; CMovI.dump());
+}
+
+void X86FlagsCopyLoweringPass::rewriteCondJmp(
+    MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
+    DebugLoc TestLoc, MachineInstr &JmpI, CondRegArray &CondRegs) {
+  // First get the register containing this specific condition.
+  X86::CondCode Cond = X86::getCondFromBranchOpc(JmpI.getOpcode());
+  unsigned CondReg;
+  bool Inverted;
+  std::tie(CondReg, Inverted) =
+      getCondOrInverseInReg(TestMBB, TestPos, TestLoc, Cond, CondRegs);
+
+  MachineBasicBlock &JmpMBB = *JmpI.getParent();
+
+  // Insert a direct test of the saved register.
+  insertTest(JmpMBB, JmpI.getIterator(), JmpI.getDebugLoc(), CondReg);
+
+  // Rewrite the jump to use the !ZF flag from the test, and kill its use of
+  // flags afterward.
+  JmpI.setDesc(TII->get(
+      X86::GetCondBranchFromCond(Inverted ? X86::COND_E : X86::COND_NE)));
+  const int ImplicitEFLAGSOpIdx = 1;
+  JmpI.getOperand(ImplicitEFLAGSOpIdx).setIsKill(true);
+  DEBUG(dbgs() << "    fixed jCC: "; JmpI.dump());
+}
+
+void X86FlagsCopyLoweringPass::rewriteCopy(MachineInstr &MI,
+                                           MachineOperand &FlagUse,
+                                           MachineInstr &CopyDefI) {
+  // Just replace this copy with the the original copy def.
+  MRI->replaceRegWith(MI.getOperand(0).getReg(),
+                      CopyDefI.getOperand(0).getReg());
+  MI.eraseFromParent();
+}
+
+void X86FlagsCopyLoweringPass::rewriteSetCC(MachineBasicBlock &TestMBB,
+                                            MachineBasicBlock::iterator TestPos,
+                                            DebugLoc TestLoc,
+                                            MachineInstr &SetCCI,
+                                            MachineOperand &FlagUse,
+                                            CondRegArray &CondRegs) {
+  X86::CondCode Cond = X86::getCondFromSETOpc(SetCCI.getOpcode());
+  // Note that we can't usefully rewrite this to the inverse without complex
+  // analysis of the users of the setCC. Largely we rely on duplicates which
+  // could have been avoided already being avoided here.
+  unsigned &CondReg = CondRegs[Cond];
+  if (!CondReg)
+    CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, Cond);
+
+  // Rewriting this is trivial: we just replace the register and remove the
+  // setcc.
+  MRI->replaceRegWith(SetCCI.getOperand(0).getReg(), CondReg);
+  SetCCI.eraseFromParent();
+}
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b718fb6ee5b..b119a1fb20a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -38721,25 +38721,6 @@ bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
   }
 }
 
-/// This function checks if any of the users of EFLAGS copies the EFLAGS. We
-/// know that the code that lowers COPY of EFLAGS has to use the stack, and if
-/// we don't adjust the stack we clobber the first frame index.
-/// See X86InstrInfo::copyPhysReg.
-static bool hasCopyImplyingStackAdjustment(const MachineFunction &MF) {
-  const MachineRegisterInfo &MRI = MF.getRegInfo();
-  return any_of(MRI.reg_instructions(X86::EFLAGS),
-                [](const MachineInstr &RI) { return RI.isCopy(); });
-}
-
-void X86TargetLowering::finalizeLowering(MachineFunction &MF) const {
-  if (hasCopyImplyingStackAdjustment(MF)) {
-    MachineFrameInfo &MFI = MF.getFrameInfo();
-    MFI.setHasCopyImplyingStackAdjustment(true);
-  }
-
-  TargetLoweringBase::finalizeLowering(MF);
-}
-
 SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc& dl,
                                                   SDValue Value, SDValue Addr,
                                                   SelectionDAG &DAG) const {
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 2fdc0c22e39..517ac3a1123 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1121,8 +1121,6 @@ namespace llvm {
     bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
                                unsigned Factor) const override;
 
-    void finalizeLowering(MachineFunction &MF) const override;
-
     SDValue expandIndirectJTBranch(const SDLoc& dl, SDValue Value, 
                                    SDValue Addr, SelectionDAG &DAG) 
                                    const override;
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 94a0bee7e8e..a39d424e69a 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -6796,102 +6796,12 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     return;
   }
 
-  bool FromEFLAGS = SrcReg == X86::EFLAGS;
-  bool ToEFLAGS = DestReg == X86::EFLAGS;
-  int Reg = FromEFLAGS ? DestReg : SrcReg;
-  bool is32 = X86::GR32RegClass.contains(Reg);
-  bool is64 = X86::GR64RegClass.contains(Reg);
-
-  if ((FromEFLAGS || ToEFLAGS) && (is32 || is64)) {
-    int Mov = is64 ? X86::MOV64rr : X86::MOV32rr;
-    int Push = is64 ? X86::PUSH64r : X86::PUSH32r;
-    int PushF = is64 ? X86::PUSHF64 : X86::PUSHF32;
-    int Pop = is64 ? X86::POP64r : X86::POP32r;
-    int PopF = is64 ? X86::POPF64 : X86::POPF32;
-    int AX = is64 ? X86::RAX : X86::EAX;
-
-    if (!Subtarget.hasLAHFSAHF()) {
-      assert(Subtarget.is64Bit() &&
-             "Not having LAHF/SAHF only happens on 64-bit.");
-      // Moving EFLAGS to / from another register requires a push and a pop.
-      // Notice that we have to adjust the stack if we don't want to clobber the
-      // first frame index. See X86FrameLowering.cpp - usesTheStack.
-      if (FromEFLAGS) {
-        BuildMI(MBB, MI, DL, get(PushF));
-        BuildMI(MBB, MI, DL, get(Pop), DestReg);
-      }
-      if (ToEFLAGS) {
-        BuildMI(MBB, MI, DL, get(Push))
-            .addReg(SrcReg, getKillRegState(KillSrc));
-        BuildMI(MBB, MI, DL, get(PopF));
-      }
-      return;
-    }
-
-    // The flags need to be saved, but saving EFLAGS with PUSHF/POPF is
-    // inefficient. Instead:
-    //   - Save the overflow flag OF into AL using SETO, and restore it using a
-    //     signed 8-bit addition of AL and INT8_MAX.
-    //   - Save/restore the bottom 8 EFLAGS bits (CF, PF, AF, ZF, SF) to/from AH
-    //     using LAHF/SAHF.
-    //   - When RAX/EAX is live and isn't the destination register, make sure it
-    //     isn't clobbered by PUSH/POP'ing it before and after saving/restoring
-    //     the flags.
-    // This approach is ~2.25x faster than using PUSHF/POPF.
-    //
-    // This is still somewhat inefficient because we don't know which flags are
-    // actually live inside EFLAGS. Were we able to do a single SETcc instead of
-    // SETO+LAHF / ADDB+SAHF the code could be 1.02x faster.
-    //
-    // PUSHF/POPF is also potentially incorrect because it affects other flags
-    // such as TF/IF/DF, which LLVM doesn't model.
-    //
-    // Notice that we have to adjust the stack if we don't want to clobber the
-    // first frame index.
-    // See X86ISelLowering.cpp - X86::hasCopyImplyingStackAdjustment.
-
-    const TargetRegisterInfo &TRI = getRegisterInfo();
-    MachineBasicBlock::LivenessQueryResult LQR =
-        MBB.computeRegisterLiveness(&TRI, AX, MI);
-    // We do not want to save and restore AX if we do not have to.
-    // Moreover, if we do so whereas AX is dead, we would need to set
-    // an undef flag on the use of AX, otherwise the verifier will
-    // complain that we read an undef value.
-    // We do not want to change the behavior of the machine verifier
-    // as this is usually wrong to read an undef value.
-    if (MachineBasicBlock::LQR_Unknown == LQR) {
-      LivePhysRegs LPR(TRI);
-      LPR.addLiveOuts(MBB);
-      MachineBasicBlock::iterator I = MBB.end();
-      while (I != MI) {
-        --I;
-        LPR.stepBackward(*I);
-      }
-      // AX contains the top most register in the aliasing hierarchy.
-      // It may not be live, but one of its aliases may be.
-      for (MCRegAliasIterator AI(AX, &TRI, true);
-           AI.isValid() && LQR != MachineBasicBlock::LQR_Live; ++AI)
-        LQR = LPR.contains(*AI) ? MachineBasicBlock::LQR_Live
-                                : MachineBasicBlock::LQR_Dead;
-    }
-    bool AXDead = (Reg == AX) || (MachineBasicBlock::LQR_Dead == LQR);
-    if (!AXDead)
-      BuildMI(MBB, MI, DL, get(Push)).addReg(AX, getKillRegState(true));
-    if (FromEFLAGS) {
-      BuildMI(MBB, MI, DL, get(X86::SETOr), X86::AL);
-      BuildMI(MBB, MI, DL, get(X86::LAHF));
-      BuildMI(MBB, MI, DL, get(Mov), Reg).addReg(AX);
-    }
-    if (ToEFLAGS) {
-      BuildMI(MBB, MI, DL, get(Mov), AX).addReg(Reg, getKillRegState(KillSrc));
-      BuildMI(MBB, MI, DL, get(X86::ADD8ri), X86::AL)
-          .addReg(X86::AL)
-          .addImm(INT8_MAX);
-      BuildMI(MBB, MI, DL, get(X86::SAHF));
-    }
-    if (!AXDead)
-      BuildMI(MBB, MI, DL, get(Pop), AX);
-    return;
+  if (SrcReg == X86::EFLAGS || DestReg == X86::EFLAGS) {
+    // FIXME: We use a fatal error here because historically LLVM has tried
+    // lower some of these physreg copies and we want to ensure we get
+    // reasonable bug reports if someone encounters a case no other testing
+    // found. This path should be removed after the LLVM 7 release.
+    report_fatal_error("Unable to copy EFLAGS physical register!");
   }
 
   DEBUG(dbgs() << "Cannot copy " << RI.getName(SrcReg)
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index fe901fc2dd1..87dadd9966c 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -64,6 +64,7 @@ void initializeX86CmovConverterPassPass(PassRegistry &);
 void initializeX86ExecutionDomainFixPass(PassRegistry &);
 void initializeX86DomainReassignmentPass(PassRegistry &);
 void initializeX86AvoidSFBPassPass(PassRegistry &);
+void initializeX86FlagsCopyLoweringPassPass(PassRegistry &);
 
 } // end namespace llvm
 
@@ -84,6 +85,7 @@ extern "C" void LLVMInitializeX86Target() {
   initializeX86ExecutionDomainFixPass(PR);
   initializeX86DomainReassignmentPass(PR);
   initializeX86AvoidSFBPassPass(PR);
+  initializeX86FlagsCopyLoweringPassPass(PR);
 }
 
 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -456,6 +458,7 @@ void X86PassConfig::addPreRegAlloc() {
     addPass(createX86AvoidStoreForwardingBlocks());
   }
 
+  addPass(createX86FlagsCopyLoweringPass());
   addPass(createX86WinAllocaExpander());
 }
 void X86PassConfig::addMachineSSAOptimization() {