Diffstat (limited to 'llvm/lib')
-rw-r--r--   llvm/lib/CodeGen/PrologEpilogInserter.cpp          20
-rw-r--r--   llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp        5
-rw-r--r--   llvm/lib/Target/X86/CMakeLists.txt                 107
-rw-r--r--   llvm/lib/Target/X86/X86.h                            5
-rw-r--r--   llvm/lib/Target/X86/X86CallFrameOptimization.cpp   400
-rw-r--r--   llvm/lib/Target/X86/X86FastISel.cpp               6716
-rw-r--r--   llvm/lib/Target/X86/X86FrameLowering.cpp          4123
-rw-r--r--   llvm/lib/Target/X86/X86FrameLowering.h             192
-rw-r--r--   llvm/lib/Target/X86/X86InstrCompiler.td           3700
-rw-r--r--   llvm/lib/Target/X86/X86InstrInfo.cpp                52
-rw-r--r--   llvm/lib/Target/X86/X86InstrInfo.h                   5
-rw-r--r--   llvm/lib/Target/X86/X86MachineFunctionInfo.h        12
-rw-r--r--   llvm/lib/Target/X86/X86RegisterInfo.cpp              5
-rw-r--r--   llvm/lib/Target/X86/X86TargetMachine.cpp             5
14 files changed, 7878 insertions, 7469 deletions
diff --git a/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/llvm/lib/CodeGen/PrologEpilogInserter.cpp
index 385e5a35afb..61407faaf32 100644
--- a/llvm/lib/CodeGen/PrologEpilogInserter.cpp
+++ b/llvm/lib/CodeGen/PrologEpilogInserter.cpp
@@ -703,7 +703,8 @@ void PEI::insertPrologEpilogCode(MachineFunction &Fn) {
 /// register references and actual offsets.
 ///
 void PEI::replaceFrameIndices(MachineFunction &Fn) {
-  if (!Fn.getFrameInfo()->hasStackObjects()) return; // Nothing to do?
+  const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering();
+  if (!TFI.needsFrameIndexResolution(Fn)) return;
 
   // Store SPAdj at exit of a basic block.
   SmallVector<int, 8> SPState;
@@ -769,13 +770,6 @@ void PEI::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &Fn,
       continue;
     }
 
-    // If we are looking at a call sequence, we need to keep track of
-    // the SP adjustment made by each instruction in the sequence.
-    // This includes both the frame setup/destroy pseudos (handled above),
-    // as well as other instructions that have side effects w.r.t the SP.
-    if (InsideCallSequence)
-      SPAdj += TII.getSPAdjust(I);
-
     MachineInstr *MI = I;
     bool DoIncr = true;
     for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
@@ -854,6 +848,16 @@ void PEI::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &Fn,
       break;
     }
 
+    // If we are looking at a call sequence, we need to keep track of
+    // the SP adjustment made by each instruction in the sequence.
+    // This includes both the frame setup/destroy pseudos (handled above),
+    // as well as other instructions that have side effects w.r.t the SP.
+    // Note that this must come after eliminateFrameIndex, because
+    // if I itself referred to a frame index, we shouldn't count its own
+    // adjustment.
+    if (MI && InsideCallSequence)
+      SPAdj += TII.getSPAdjust(MI);
+
     if (DoIncr && I != BB->end()) ++I;
 
     // Update register states.
diff --git a/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp b/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp
index 1557d10238e..e3f01912b87 100644
--- a/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp
+++ b/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp
@@ -42,3 +42,8 @@ int TargetFrameLowering::getFrameIndexReference(const MachineFunction &MF,
   FrameReg = RI->getFrameRegister(MF);
   return getFrameIndexOffset(MF, FI);
 }
+
+bool TargetFrameLowering::needsFrameIndexResolution(
+    const MachineFunction &MF) const {
+  return MF.getFrameInfo()->hasStackObjects();
+}
diff --git a/llvm/lib/Target/X86/CMakeLists.txt b/llvm/lib/Target/X86/CMakeLists.txt
index 1083fad80e8..461915f3414 100644
--- a/llvm/lib/Target/X86/CMakeLists.txt
+++ b/llvm/lib/Target/X86/CMakeLists.txt
@@ -1,53 +1,54 @@
-set(LLVM_TARGET_DEFINITIONS X86.td)
-
-tablegen(LLVM X86GenRegisterInfo.inc -gen-register-info)
-tablegen(LLVM X86GenDisassemblerTables.inc -gen-disassembler)
-tablegen(LLVM X86GenInstrInfo.inc -gen-instr-info)
-tablegen(LLVM X86GenAsmWriter.inc -gen-asm-writer)
-tablegen(LLVM X86GenAsmWriter1.inc -gen-asm-writer -asmwriternum=1)
-tablegen(LLVM X86GenAsmMatcher.inc -gen-asm-matcher)
-tablegen(LLVM X86GenDAGISel.inc -gen-dag-isel)
-tablegen(LLVM X86GenFastISel.inc -gen-fast-isel)
-tablegen(LLVM X86GenCallingConv.inc -gen-callingconv)
-tablegen(LLVM X86GenSubtargetInfo.inc -gen-subtarget)
-add_public_tablegen_target(X86CommonTableGen)
-
-set(sources
-  X86AsmPrinter.cpp
-  X86FastISel.cpp
-  X86FloatingPoint.cpp
-  X86FrameLowering.cpp
-  X86ISelDAGToDAG.cpp
-  X86ISelLowering.cpp
-  X86InstrInfo.cpp
-  X86MCInstLower.cpp
-  X86MachineFunctionInfo.cpp
-  X86PadShortFunction.cpp
-  X86RegisterInfo.cpp
-  X86SelectionDAGInfo.cpp
-  X86Subtarget.cpp
-  X86TargetMachine.cpp
-  X86TargetObjectFile.cpp
-  X86TargetTransformInfo.cpp
-  X86VZeroUpper.cpp
-  X86FixupLEAs.cpp
-  )
-
-if( CMAKE_CL_64 )
-  enable_language(ASM_MASM)
-  ADD_CUSTOM_COMMAND(
-    OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/X86CompilationCallback_Win64.obj
-    MAIN_DEPENDENCY X86CompilationCallback_Win64.asm
-    COMMAND ${CMAKE_ASM_MASM_COMPILER} /Fo ${CMAKE_CURRENT_BINARY_DIR}/X86CompilationCallback_Win64.obj /c ${CMAKE_CURRENT_SOURCE_DIR}/X86CompilationCallback_Win64.asm
-   )
-   set(sources ${sources} ${CMAKE_CURRENT_BINARY_DIR}/X86CompilationCallback_Win64.obj)
-endif()
-
-add_llvm_target(X86CodeGen ${sources})
-
-add_subdirectory(AsmParser)
-add_subdirectory(Disassembler)
-add_subdirectory(InstPrinter)
-add_subdirectory(MCTargetDesc)
-add_subdirectory(TargetInfo)
-add_subdirectory(Utils)
+set(LLVM_TARGET_DEFINITIONS X86.td)
 +
 +tablegen(LLVM X86GenRegisterInfo.inc -gen-register-info)
 +tablegen(LLVM X86GenDisassemblerTables.inc -gen-disassembler)
 +tablegen(LLVM X86GenInstrInfo.inc -gen-instr-info)
 +tablegen(LLVM X86GenAsmWriter.inc -gen-asm-writer)
 +tablegen(LLVM X86GenAsmWriter1.inc -gen-asm-writer -asmwriternum=1)
 +tablegen(LLVM X86GenAsmMatcher.inc -gen-asm-matcher)
 +tablegen(LLVM X86GenDAGISel.inc -gen-dag-isel)
 +tablegen(LLVM X86GenFastISel.inc -gen-fast-isel)
 +tablegen(LLVM X86GenCallingConv.inc -gen-callingconv)
 +tablegen(LLVM X86GenSubtargetInfo.inc -gen-subtarget)
 +add_public_tablegen_target(X86CommonTableGen)
 +
 +set(sources
 +  X86AsmPrinter.cpp
 +  X86CallFrameOptimization.cpp
 +  X86FastISel.cpp
 +  X86FloatingPoint.cpp
 +  X86FrameLowering.cpp
 +  X86ISelDAGToDAG.cpp
 +  X86ISelLowering.cpp
 +  X86InstrInfo.cpp
 +  X86MCInstLower.cpp
 +  X86MachineFunctionInfo.cpp
 +  X86PadShortFunction.cpp
 +  X86RegisterInfo.cpp
 +  X86SelectionDAGInfo.cpp
 +  X86Subtarget.cpp
 +  X86TargetMachine.cpp
 +  X86TargetObjectFile.cpp
 +  X86TargetTransformInfo.cpp
 +  X86VZeroUpper.cpp
 +  X86FixupLEAs.cpp
 +  )
 +
 +if( CMAKE_CL_64 )
 +  enable_language(ASM_MASM)
 +  ADD_CUSTOM_COMMAND(
 +    OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/X86CompilationCallback_Win64.obj
 +    MAIN_DEPENDENCY X86CompilationCallback_Win64.asm
 +    COMMAND ${CMAKE_ASM_MASM_COMPILER} /Fo ${CMAKE_CURRENT_BINARY_DIR}/X86CompilationCallback_Win64.obj /c ${CMAKE_CURRENT_SOURCE_DIR}/X86CompilationCallback_Win64.asm
 +   )
 +   set(sources ${sources} ${CMAKE_CURRENT_BINARY_DIR}/X86CompilationCallback_Win64.obj)
 +endif()
 +
 +add_llvm_target(X86CodeGen ${sources})
 +
 +add_subdirectory(AsmParser)
 +add_subdirectory(Disassembler)
 +add_subdirectory(InstPrinter)
 +add_subdirectory(MCTargetDesc)
 +add_subdirectory(TargetInfo)
 +add_subdirectory(Utils)
diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h
index 71fc567cb55..8b0a4cf477f 100644
--- a/llvm/lib/Target/X86/X86.h
+++ b/llvm/lib/Target/X86/X86.h
@@ -64,6 +64,11 @@ FunctionPass *createX86PadShortFunctions();
 /// to eliminate execution delays in some Atom processors.
 FunctionPass *createX86FixupLEAs();
 
+/// createX86CallFrameOptimization - Return a pass that optimizes
+/// the code-size of x86 call sequences. This is done by replacing
+/// esp-relative movs with pushes.
+FunctionPass *createX86CallFrameOptimization();
+
 } // End llvm namespace
 
 #endif
diff --git a/llvm/lib/Target/X86/X86CallFrameOptimization.cpp b/llvm/lib/Target/X86/X86CallFrameOptimization.cpp
new file mode 100644
index 00000000000..f832b94fdc6
--- /dev/null
+++ b/llvm/lib/Target/X86/X86CallFrameOptimization.cpp
@@ -0,0 +1,400 @@
+//===----- X86CallFrameOptimization.cpp - Optimize x86 call sequences -----===//
 +//
 +//                     The LLVM Compiler Infrastructure
 +//
 +// This file is distributed under the University of Illinois Open Source
 +// License. See LICENSE.TXT for details.
 +//
 +//===----------------------------------------------------------------------===//
 +//
 +// This file defines a pass that optimizes call sequences on x86.
 +// Currently, it converts movs of function parameters onto the stack into 
 +// pushes. This is beneficial for two main reasons:
 +// 1) The push instruction encoding is much smaller than an esp-relative mov
 +// 2) It is possible to push memory arguments directly. So, if the
 +//    transformation is performed pre-reg-alloc, it can help relieve
 +//    register pressure.
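 +//
 +// For example (illustrative only; "foo" is a hypothetical callee), a
 +// call sequence such as
 +//    movl %eax, (%esp)
 +//    movl $42, 4(%esp)
 +//    calll foo
 +// can instead be emitted as
 +//    pushl $42
 +//    pushl %eax
 +//    calll foo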
 +//
 +//===----------------------------------------------------------------------===//
 +
 +#include <algorithm>
 +
 +#include "X86.h"
 +#include "X86InstrInfo.h"
 +#include "X86Subtarget.h"
 +#include "X86MachineFunctionInfo.h"
 +#include "llvm/ADT/Statistic.h"
 +#include "llvm/CodeGen/MachineFunctionPass.h"
 +#include "llvm/CodeGen/MachineInstrBuilder.h"
 +#include "llvm/CodeGen/MachineRegisterInfo.h"
 +#include "llvm/CodeGen/Passes.h"
 +#include "llvm/IR/Function.h"
 +#include "llvm/Support/Debug.h"
 +#include "llvm/Support/raw_ostream.h"
 +#include "llvm/Target/TargetInstrInfo.h"
 +
 +using namespace llvm;
 +
 +#define DEBUG_TYPE "x86-cf-opt"
 +
 +cl::opt<bool> NoX86CFOpt("no-x86-call-frame-opt",
 +              cl::desc("Avoid optimizing x86 call frames for size"),
 +              cl::init(false), cl::Hidden);
 +
 +namespace {
 +class X86CallFrameOptimization : public MachineFunctionPass {
 +public:
 +  X86CallFrameOptimization() : MachineFunctionPass(ID) {}
 +
 +  bool runOnMachineFunction(MachineFunction &MF) override;
 +
 +private:
 +  bool shouldPerformTransformation(MachineFunction &MF);
 +
 +  bool adjustCallSequence(MachineFunction &MF, MachineBasicBlock &MBB,
 +                          MachineBasicBlock::iterator I);
 +
 +  MachineInstr *canFoldIntoRegPush(MachineBasicBlock::iterator FrameSetup,
 +                                   unsigned Reg);
 +
 +  const char *getPassName() const override {
 +    return "X86 Optimize Call Frame";
 +  }
 +
 +  const TargetInstrInfo *TII;
 +  const TargetFrameLowering *TFL;
 +  const MachineRegisterInfo *MRI;
 +  static char ID;
 +};
 +
 +char X86CallFrameOptimization::ID = 0;
 +}
 +
 +FunctionPass *llvm::createX86CallFrameOptimization() {
 +  return new X86CallFrameOptimization();
 +}
 +
 +// This checks whether the transformation is legal and profitable
 +bool X86CallFrameOptimization::shouldPerformTransformation(MachineFunction &MF) {
 +  if (NoX86CFOpt.getValue())
 +    return false;
 +
 +  // We currently only support call sequences where *all* parameters
 +  // are passed on the stack.
 +  // No point in running this in 64-bit mode, since some arguments are
 +  // passed in-register in all common calling conventions, so the pattern
 +  // we're looking for will never match.
 +  const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
 +  if (STI.is64Bit())
 +    return false;
 +
 +  // You would expect straight-line code between call-frame setup and
 +  // call-frame destroy. You would be wrong. There are circumstances (e.g.
 +  // CMOV_GR8 expansion of a select that feeds a function call!) where we can
 +  // end up with the setup and the destroy in different basic blocks.
 +  // This is bad, and breaks SP adjustment.
 +  // So, check that all of the frames in the function are closed inside
 +  // the same block, and, for good measure, that there are no nested frames.
 +  int FrameSetupOpcode = TII->getCallFrameSetupOpcode();
 +  int FrameDestroyOpcode = TII->getCallFrameDestroyOpcode();
 +  for (MachineBasicBlock &BB : MF) {
 +    bool InsideFrameSequence = false;
 +    for (MachineInstr &MI : BB) {
 +      if (MI.getOpcode() == FrameSetupOpcode) {
 +        if (InsideFrameSequence)
 +          return false;
 +        InsideFrameSequence = true;
 +      }
 +      else if (MI.getOpcode() == FrameDestroyOpcode) {
 +        if (!InsideFrameSequence)
 +          return false;
 +        InsideFrameSequence = false;
 +      }
 +    }
 +
 +    if (InsideFrameSequence)
 +      return false;
 +  }
 +
 +  // Now that we know the transformation is legal, check if it is
 +  // profitable.
 +  // TODO: Add a heuristic that actually looks at the function,
 +  //       and enable this for more cases.
 +
 +  // This transformation is always a win when we expected to have
 +  // a reserved call frame. Under other circumstances, it may be either 
 +  // a win or a loss, and requires a heuristic.
 +  // For now, enable it only for the relatively clear win cases.
 +  bool CannotReserveFrame = MF.getFrameInfo()->hasVarSizedObjects();
 +  if (CannotReserveFrame)
 +    return true;
 +
 +  // For now, don't even try to evaluate the profitability when
 +  // not optimizing for size.
 +  AttributeSet FnAttrs = MF.getFunction()->getAttributes();
 +  bool OptForSize =
 +    FnAttrs.hasAttribute(AttributeSet::FunctionIndex,
 +    Attribute::OptimizeForSize) ||
 +    FnAttrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize);
 +
 +  if (!OptForSize)
 +    return false;
 +
 +  // Stack re-alignment can make this unprofitable even in terms of size.
 +  // As mentioned above, a better heuristic is needed. For now, don't do this
 +  // when the required alignment is above 8. (4 would be the safe choice, but
 +  // some experimentation showed 8 is generally good).
 +  if (TFL->getStackAlignment() > 8)
 +    return false;
 +
 +  return true;
 +}
 +
 +bool X86CallFrameOptimization::runOnMachineFunction(MachineFunction &MF) {
 +  TII = MF.getSubtarget().getInstrInfo();
 +  TFL = MF.getSubtarget().getFrameLowering();
 +  MRI = &MF.getRegInfo();
 +
 +  if (!shouldPerformTransformation(MF))
 +    return false;
 +
 +  int FrameSetupOpcode = TII->getCallFrameSetupOpcode();
 +
 +  bool Changed = false;
 +
 +  for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB)
 +    for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I)
 +      if (I->getOpcode() == FrameSetupOpcode)
 +        Changed |= adjustCallSequence(MF, *BB, I);
 +
 +  return Changed;
 +}
 +
 +bool X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
 +                                                MachineBasicBlock &MBB,
 +                                                MachineBasicBlock::iterator I) {
 +
 +  // Check that this particular call sequence is amenable to the
 +  // transformation.
 +  const X86RegisterInfo &RegInfo = *static_cast<const X86RegisterInfo *>(
 +                                       MF.getSubtarget().getRegisterInfo());
 +  unsigned StackPtr = RegInfo.getStackRegister();
 +  int FrameDestroyOpcode = TII->getCallFrameDestroyOpcode();
 +
 +  // We expect to enter this at the beginning of a call sequence
 +  assert(I->getOpcode() == TII->getCallFrameSetupOpcode());
 +  MachineBasicBlock::iterator FrameSetup = I++;
 +
 +  
 +  // For globals in PIC mode, we can have some LEAs here.
 +  // Ignore them, they don't bother us.
 +  // TODO: Extend this to something that covers more cases.
 +  while (I->getOpcode() == X86::LEA32r)
 +    ++I;
 +  
 +  // We expect a copy instruction here.
 +  // TODO: The copy instruction is a lowering artifact.
 +  //       We should also support a copy-less version, where the stack
 +  //       pointer is used directly.
 +  if (!I->isCopy() || !I->getOperand(0).isReg())
 +    return false;
 +  MachineBasicBlock::iterator SPCopy = I++;
 +  StackPtr = SPCopy->getOperand(0).getReg();
 +
 +  // Scan the call setup sequence for the pattern we're looking for.
 +  // We only handle a simple case - a sequence of MOV32mi or MOV32mr
 +  // instructions, that push a sequence of 32-bit values onto the stack, with
 +  // no gaps between them.
 +  SmallVector<MachineInstr*, 4> MovVector(4, nullptr);
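 +  // Operand 0 of the frame setup pseudo is the total stack adjustment for
 +  // this call, in bytes; with 4-byte slots it bounds the parameter count.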
 +  unsigned int MaxAdjust = FrameSetup->getOperand(0).getImm() / 4;
 +  if (MaxAdjust > 4)
 +    MovVector.resize(MaxAdjust, nullptr);
 +
 +  do {
 +    int Opcode = I->getOpcode();
 +    if (Opcode != X86::MOV32mi && Opcode != X86::MOV32mr)
 +      break;
 +
 +    // We only want movs of the form:
 +    // movl imm/r32, k(%esp)
 +    // If we run into something else, bail.
 +    // Note that AddrBaseReg may, counter to its name, not be a register,
 +    // but rather a frame index.
 +    // TODO: Support the fi case. This should probably work now that we
 +    // have the infrastructure to track the stack pointer within a call
 +    // sequence.
 +    if (!I->getOperand(X86::AddrBaseReg).isReg() ||
 +        (I->getOperand(X86::AddrBaseReg).getReg() != StackPtr) ||
 +        !I->getOperand(X86::AddrScaleAmt).isImm() ||
 +        (I->getOperand(X86::AddrScaleAmt).getImm() != 1) ||
 +        (I->getOperand(X86::AddrIndexReg).getReg() != X86::NoRegister) ||
 +        (I->getOperand(X86::AddrSegmentReg).getReg() != X86::NoRegister) ||
 +        !I->getOperand(X86::AddrDisp).isImm())
 +      return false;
 +
 +    int64_t StackDisp = I->getOperand(X86::AddrDisp).getImm();
 +    assert(StackDisp >= 0 && "Negative stack displacement when passing parameters");
 +
 +    // We really don't want to consider the unaligned case.
 +    if (StackDisp % 4)
 +      return false;
 +    StackDisp /= 4;
 +
 +    assert((size_t)StackDisp < MovVector.size() &&
 +      "Function call has more parameters than the stack is adjusted for.");
 +
 +    // If the same stack slot is being filled twice, something's fishy.
 +    if (MovVector[StackDisp] != nullptr)
 +      return false;
 +    MovVector[StackDisp] = I;
 +
 +    ++I;
 +  } while (I != MBB.end());
 +
 +  // We now expect the end of the sequence - a call and a stack adjust.
 +  if (I == MBB.end())
 +    return false;
 +
 +  // For PCrel calls, we expect an additional COPY of the basereg.
 +  // If we find one, skip it.
 +  if (I->isCopy()) {
 +    if (I->getOperand(1).getReg() ==
 +      MF.getInfo<X86MachineFunctionInfo>()->getGlobalBaseReg())
 +      ++I;
 +    else
 +      return false;
 +  }
 +
 +  if (!I->isCall())
 +    return false;
 +  MachineBasicBlock::iterator Call = I;
 +  if ((++I)->getOpcode() != FrameDestroyOpcode)
 +    return false;
 +
 +  // Now, go through the vector, and see that we don't have any gaps,
 +  // but only a series of 32-bit MOVs.
 +  
 +  int64_t ExpectedDist = 0;
 +  auto MMI = MovVector.begin(), MME = MovVector.end();
 +  for (; MMI != MME; ++MMI, ExpectedDist += 4)
 +    if (*MMI == nullptr)
 +      break;
 +  
 +  // If the call had no parameters, do nothing
 +  if (!ExpectedDist)
 +    return false;
 +
 +  // We are either at the last parameter, or a gap. 
 +  // Make sure it's not a gap
 +  for (; MMI != MME; ++MMI)
 +    if (*MMI != nullptr)
 +      return false;
 +
 +  // Ok, we can in fact do the transformation for this call.
 +  // Do not remove the FrameSetup instruction, but adjust the parameters.
 +  // PEI will end up finalizing the handling of this.
 +  FrameSetup->getOperand(1).setImm(ExpectedDist);
 +
 +  DebugLoc DL = I->getDebugLoc();
 +  // Now, iterate through the vector in reverse order, and replace the movs
 +  // with pushes. MOVmi/MOVmr doesn't have any defs, so no need to 
 +  // replace uses.
 +  for (int Idx = (ExpectedDist / 4) - 1; Idx >= 0; --Idx) {
 +    MachineBasicBlock::iterator MOV = *MovVector[Idx];
 +    MachineOperand PushOp = MOV->getOperand(X86::AddrNumOperands);
 +    if (MOV->getOpcode() == X86::MOV32mi) {
 +      unsigned PushOpcode = X86::PUSHi32;
 +      // If the operand is a small (8-bit) immediate, we can use a
 +      // PUSH instruction with a shorter encoding.
 +      // Note that isImm() may fail even though this is a MOVmi, because
 +      // the operand can also be a symbol.
 +      if (PushOp.isImm()) {
 +        int64_t Val = PushOp.getImm();
 +        if (isInt<8>(Val))
 +          PushOpcode = X86::PUSH32i8;
 +      }
 +      BuildMI(MBB, Call, DL, TII->get(PushOpcode)).addOperand(PushOp);
 +    } else {
 +      unsigned int Reg = PushOp.getReg();
 +
 +      // If PUSHrmm is not slow on this target, try to fold the source of the
 +      // push into the instruction.
 +      const X86Subtarget &ST = MF.getTarget().getSubtarget<X86Subtarget>();
 +      bool SlowPUSHrmm = ST.isAtom() || ST.isSLM();
 +
 +      // Check that this is legal to fold. Right now, we're extremely
 +      // conservative about that.
 +      MachineInstr *DefMov = nullptr;
 +      if (!SlowPUSHrmm && (DefMov = canFoldIntoRegPush(FrameSetup, Reg))) {
 +        MachineInstr *Push = BuildMI(MBB, Call, DL, TII->get(X86::PUSH32rmm));
 +
 +        unsigned NumOps = DefMov->getDesc().getNumOperands();
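 +        // The trailing AddrNumOperands operands of the MOV32rm are its
 +        // memory reference (base, scale, index, disp, segment); copy them
 +        // onto the new push.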
 +        for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i)
 +          Push->addOperand(DefMov->getOperand(i));
 +
 +        DefMov->eraseFromParent();
 +      } else {
 +        BuildMI(MBB, Call, DL, TII->get(X86::PUSH32r)).addReg(Reg).getInstr();
 +      }
 +    }
 +
 +    MBB.erase(MOV);
 +  }
 +
 +  // The stack-pointer copy is no longer used in the call sequences.
 +  // There should not be any other users, but we can't commit to that, so:
 +  if (MRI->use_empty(SPCopy->getOperand(0).getReg()))
 +    SPCopy->eraseFromParent();
 +
 +  // Once we've done this, we need to make sure PEI doesn't assume a reserved
 +  // frame.
 +  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
 +  FuncInfo->setHasPushSequences(true);
 +
 +  return true;
 +}
 +
 +MachineInstr *X86CallFrameOptimization::canFoldIntoRegPush(
 +    MachineBasicBlock::iterator FrameSetup, unsigned Reg) {
 +  // Do an extremely restricted form of load folding.
 +  // ISel will often create patterns like:
 +  // movl    4(%edi), %eax
 +  // movl    8(%edi), %ecx
 +  // movl    12(%edi), %edx
 +  // movl    %edx, 8(%esp)
 +  // movl    %ecx, 4(%esp)
 +  // movl    %eax, (%esp)
 +  // call
 +  // Get rid of those with prejudice.
 +  if (!TargetRegisterInfo::isVirtualRegister(Reg))
 +    return nullptr;
 +
 +  // Make sure this is the only use of Reg.
 +  if (!MRI->hasOneNonDBGUse(Reg))
 +    return nullptr;
 +
 +  MachineBasicBlock::iterator DefMI = MRI->getVRegDef(Reg);
 +
 +  // Make sure the def is a MOV from memory.
 +  // If the def is in another block, give up.
 +  if (DefMI->getOpcode() != X86::MOV32rm ||
 +      DefMI->getParent() != FrameSetup->getParent())
 +    return nullptr;
 +
 +  // Be careful with movs that load from a stack slot, since it may get
 +  // resolved incorrectly.
 +  // TODO: Again, we already have the infrastructure, so this should work.
 +  if (!DefMI->getOperand(1).isReg())
 +    return nullptr;
 +
 +  // Now, make sure everything else up until the ADJCALLSTACK is a sequence
 +  // of MOVs. To be less conservative would require duplicating a lot of the
 +  // logic from PeepholeOptimizer.
 +  // FIXME: A possibly better approach would be to teach the PeepholeOptimizer
 +  // to be smarter about folding into pushes. 
 +  for (auto I = DefMI; I != FrameSetup; ++I)
 +    if (I->getOpcode() != X86::MOV32rm)
 +      return nullptr;
 +
 +  return DefMI;
 +}
 diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp index 227cacd24eb..220ba312197 100644 --- a/llvm/lib/Target/X86/X86FastISel.cpp +++ b/llvm/lib/Target/X86/X86FastISel.cpp @@ -1,3358 +1,3358 @@ -//===-- X86FastISel.cpp - X86 FastISel implementation ---------------------===// -// -//                     The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines the X86-specific support for the FastISel class. Much -// of the target-specific code is generated by tablegen in the file -// X86GenFastISel.inc, which is #included here. -// -//===----------------------------------------------------------------------===// - -#include "X86.h" -#include "X86CallingConv.h" -#include "X86InstrBuilder.h" -#include "X86InstrInfo.h" -#include "X86MachineFunctionInfo.h" -#include "X86RegisterInfo.h" -#include "X86Subtarget.h" -#include "X86TargetMachine.h" -#include "llvm/Analysis/BranchProbabilityInfo.h" -#include "llvm/CodeGen/Analysis.h" -#include "llvm/CodeGen/FastISel.h" -#include "llvm/CodeGen/FunctionLoweringInfo.h" -#include "llvm/CodeGen/MachineConstantPool.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/IR/CallSite.h" -#include "llvm/IR/CallingConv.h" -#include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/GetElementPtrTypeIterator.h" -#include "llvm/IR/GlobalAlias.h" -#include "llvm/IR/GlobalVariable.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Operator.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Target/TargetOptions.h" -using namespace llvm; - -namespace { - -class X86FastISel final : public FastISel { -  /// Subtarget - Keep a pointer to the X86Subtarget around so that we can -  /// make the right decision when generating code for different targets. -  const X86Subtarget *Subtarget; - -  /// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87 -  /// floating point ops. -  /// When SSE is available, use it for f32 operations. -  /// When SSE2 is available, use it for f64 operations. -  bool X86ScalarSSEf64; -  bool X86ScalarSSEf32; - -public: -  explicit X86FastISel(FunctionLoweringInfo &funcInfo, -                       const TargetLibraryInfo *libInfo) -    : FastISel(funcInfo, libInfo) { -    Subtarget = &TM.getSubtarget<X86Subtarget>(); -    X86ScalarSSEf64 = Subtarget->hasSSE2(); -    X86ScalarSSEf32 = Subtarget->hasSSE1(); -  } - -  bool fastSelectInstruction(const Instruction *I) override; - -  /// \brief The specified machine instr operand is a vreg, and that -  /// vreg is being provided by the specified load instruction.  If possible, -  /// try to fold the load as an operand to the instruction, returning true if -  /// possible. 
-  bool tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, -                           const LoadInst *LI) override; - -  bool fastLowerArguments() override; -  bool fastLowerCall(CallLoweringInfo &CLI) override; -  bool fastLowerIntrinsicCall(const IntrinsicInst *II) override; - -#include "X86GenFastISel.inc" - -private: -  bool X86FastEmitCompare(const Value *LHS, const Value *RHS, EVT VT, DebugLoc DL); - -  bool X86FastEmitLoad(EVT VT, const X86AddressMode &AM, MachineMemOperand *MMO, -                       unsigned &ResultReg); - -  bool X86FastEmitStore(EVT VT, const Value *Val, const X86AddressMode &AM, -                        MachineMemOperand *MMO = nullptr, bool Aligned = false); -  bool X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill, -                        const X86AddressMode &AM, -                        MachineMemOperand *MMO = nullptr, bool Aligned = false); - -  bool X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, EVT SrcVT, -                         unsigned &ResultReg); - -  bool X86SelectAddress(const Value *V, X86AddressMode &AM); -  bool X86SelectCallAddress(const Value *V, X86AddressMode &AM); - -  bool X86SelectLoad(const Instruction *I); - -  bool X86SelectStore(const Instruction *I); - -  bool X86SelectRet(const Instruction *I); - -  bool X86SelectCmp(const Instruction *I); - -  bool X86SelectZExt(const Instruction *I); - -  bool X86SelectBranch(const Instruction *I); - -  bool X86SelectShift(const Instruction *I); - -  bool X86SelectDivRem(const Instruction *I); - -  bool X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I); - -  bool X86FastEmitSSESelect(MVT RetVT, const Instruction *I); - -  bool X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I); - -  bool X86SelectSelect(const Instruction *I); - -  bool X86SelectTrunc(const Instruction *I); - -  bool X86SelectFPExt(const Instruction *I); -  bool X86SelectFPTrunc(const Instruction *I); - -  const X86InstrInfo *getInstrInfo() const { -    return getTargetMachine()->getSubtargetImpl()->getInstrInfo(); -  } -  const X86TargetMachine *getTargetMachine() const { -    return static_cast<const X86TargetMachine *>(&TM); -  } - -  bool handleConstantAddresses(const Value *V, X86AddressMode &AM); - -  unsigned X86MaterializeInt(const ConstantInt *CI, MVT VT); -  unsigned X86MaterializeFP(const ConstantFP *CFP, MVT VT); -  unsigned X86MaterializeGV(const GlobalValue *GV, MVT VT); -  unsigned fastMaterializeConstant(const Constant *C) override; - -  unsigned fastMaterializeAlloca(const AllocaInst *C) override; - -  unsigned fastMaterializeFloatZero(const ConstantFP *CF) override; - -  /// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is -  /// computed in an SSE register, not on the X87 floating point stack. -  bool isScalarFPTypeInSSEReg(EVT VT) const { -    return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2 -      (VT == MVT::f32 && X86ScalarSSEf32);   // f32 is when SSE1 -  } - -  bool isTypeLegal(Type *Ty, MVT &VT, bool AllowI1 = false); - -  bool IsMemcpySmall(uint64_t Len); - -  bool TryEmitSmallMemcpy(X86AddressMode DestAM, -                          X86AddressMode SrcAM, uint64_t Len); - -  bool foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I, -                            const Value *Cond); -}; - -} // end anonymous namespace. 
- -static std::pair<X86::CondCode, bool> -getX86ConditionCode(CmpInst::Predicate Predicate) { -  X86::CondCode CC = X86::COND_INVALID; -  bool NeedSwap = false; -  switch (Predicate) { -  default: break; -  // Floating-point Predicates -  case CmpInst::FCMP_UEQ: CC = X86::COND_E;       break; -  case CmpInst::FCMP_OLT: NeedSwap = true; // fall-through -  case CmpInst::FCMP_OGT: CC = X86::COND_A;       break; -  case CmpInst::FCMP_OLE: NeedSwap = true; // fall-through -  case CmpInst::FCMP_OGE: CC = X86::COND_AE;      break; -  case CmpInst::FCMP_UGT: NeedSwap = true; // fall-through -  case CmpInst::FCMP_ULT: CC = X86::COND_B;       break; -  case CmpInst::FCMP_UGE: NeedSwap = true; // fall-through -  case CmpInst::FCMP_ULE: CC = X86::COND_BE;      break; -  case CmpInst::FCMP_ONE: CC = X86::COND_NE;      break; -  case CmpInst::FCMP_UNO: CC = X86::COND_P;       break; -  case CmpInst::FCMP_ORD: CC = X86::COND_NP;      break; -  case CmpInst::FCMP_OEQ: // fall-through -  case CmpInst::FCMP_UNE: CC = X86::COND_INVALID; break; - -  // Integer Predicates -  case CmpInst::ICMP_EQ:  CC = X86::COND_E;       break; -  case CmpInst::ICMP_NE:  CC = X86::COND_NE;      break; -  case CmpInst::ICMP_UGT: CC = X86::COND_A;       break; -  case CmpInst::ICMP_UGE: CC = X86::COND_AE;      break; -  case CmpInst::ICMP_ULT: CC = X86::COND_B;       break; -  case CmpInst::ICMP_ULE: CC = X86::COND_BE;      break; -  case CmpInst::ICMP_SGT: CC = X86::COND_G;       break; -  case CmpInst::ICMP_SGE: CC = X86::COND_GE;      break; -  case CmpInst::ICMP_SLT: CC = X86::COND_L;       break; -  case CmpInst::ICMP_SLE: CC = X86::COND_LE;      break; -  } - -  return std::make_pair(CC, NeedSwap); -} - -static std::pair<unsigned, bool> -getX86SSEConditionCode(CmpInst::Predicate Predicate) { -  unsigned CC; -  bool NeedSwap = false; - -  // SSE Condition code mapping: -  //  0 - EQ -  //  1 - LT -  //  2 - LE -  //  3 - UNORD -  //  4 - NEQ -  //  5 - NLT -  //  6 - NLE -  //  7 - ORD -  switch (Predicate) { -  default: llvm_unreachable("Unexpected predicate"); -  case CmpInst::FCMP_OEQ: CC = 0;          break; -  case CmpInst::FCMP_OGT: NeedSwap = true; // fall-through -  case CmpInst::FCMP_OLT: CC = 1;          break; -  case CmpInst::FCMP_OGE: NeedSwap = true; // fall-through -  case CmpInst::FCMP_OLE: CC = 2;          break; -  case CmpInst::FCMP_UNO: CC = 3;          break; -  case CmpInst::FCMP_UNE: CC = 4;          break; -  case CmpInst::FCMP_ULE: NeedSwap = true; // fall-through -  case CmpInst::FCMP_UGE: CC = 5;          break; -  case CmpInst::FCMP_ULT: NeedSwap = true; // fall-through -  case CmpInst::FCMP_UGT: CC = 6;          break; -  case CmpInst::FCMP_ORD: CC = 7;          break; -  case CmpInst::FCMP_UEQ: -  case CmpInst::FCMP_ONE: CC = 8;          break; -  } - -  return std::make_pair(CC, NeedSwap); -} - -/// \brief Check if it is possible to fold the condition from the XALU intrinsic -/// into the user. The condition code will only be updated on success. 
-bool X86FastISel::foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I, -                                       const Value *Cond) { -  if (!isa<ExtractValueInst>(Cond)) -    return false; - -  const auto *EV = cast<ExtractValueInst>(Cond); -  if (!isa<IntrinsicInst>(EV->getAggregateOperand())) -    return false; - -  const auto *II = cast<IntrinsicInst>(EV->getAggregateOperand()); -  MVT RetVT; -  const Function *Callee = II->getCalledFunction(); -  Type *RetTy = -    cast<StructType>(Callee->getReturnType())->getTypeAtIndex(0U); -  if (!isTypeLegal(RetTy, RetVT)) -    return false; - -  if (RetVT != MVT::i32 && RetVT != MVT::i64) -    return false; - -  X86::CondCode TmpCC; -  switch (II->getIntrinsicID()) { -  default: return false; -  case Intrinsic::sadd_with_overflow: -  case Intrinsic::ssub_with_overflow: -  case Intrinsic::smul_with_overflow: -  case Intrinsic::umul_with_overflow: TmpCC = X86::COND_O; break; -  case Intrinsic::uadd_with_overflow: -  case Intrinsic::usub_with_overflow: TmpCC = X86::COND_B; break; -  } - -  // Check if both instructions are in the same basic block. -  if (II->getParent() != I->getParent()) -    return false; - -  // Make sure nothing is in the way -  BasicBlock::const_iterator Start = I; -  BasicBlock::const_iterator End = II; -  for (auto Itr = std::prev(Start); Itr != End; --Itr) { -    // We only expect extractvalue instructions between the intrinsic and the -    // instruction to be selected. -    if (!isa<ExtractValueInst>(Itr)) -      return false; - -    // Check that the extractvalue operand comes from the intrinsic. -    const auto *EVI = cast<ExtractValueInst>(Itr); -    if (EVI->getAggregateOperand() != II) -      return false; -  } - -  CC = TmpCC; -  return true; -} - -bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) { -  EVT evt = TLI.getValueType(Ty, /*HandleUnknown=*/true); -  if (evt == MVT::Other || !evt.isSimple()) -    // Unhandled type. Halt "fast" selection and bail. -    return false; - -  VT = evt.getSimpleVT(); -  // For now, require SSE/SSE2 for performing floating-point operations, -  // since x87 requires additional work. -  if (VT == MVT::f64 && !X86ScalarSSEf64) -    return false; -  if (VT == MVT::f32 && !X86ScalarSSEf32) -    return false; -  // Similarly, no f80 support yet. -  if (VT == MVT::f80) -    return false; -  // We only handle legal types. For example, on x86-32 the instruction -  // selector contains all of the 64-bit instructions from x86-64, -  // under the assumption that i64 won't be used if the target doesn't -  // support it. -  return (AllowI1 && VT == MVT::i1) || TLI.isTypeLegal(VT); -} - -#include "X86GenCallingConv.inc" - -/// X86FastEmitLoad - Emit a machine instruction to load a value of type VT. -/// The address is either pre-computed, i.e. Ptr, or a GlobalAddress, i.e. GV. -/// Return true and the result register by reference if it is possible. -bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM, -                                  MachineMemOperand *MMO, unsigned &ResultReg) { -  // Get opcode and regclass of the output for the given load instruction. 
-  unsigned Opc = 0; -  const TargetRegisterClass *RC = nullptr; -  switch (VT.getSimpleVT().SimpleTy) { -  default: return false; -  case MVT::i1: -  case MVT::i8: -    Opc = X86::MOV8rm; -    RC  = &X86::GR8RegClass; -    break; -  case MVT::i16: -    Opc = X86::MOV16rm; -    RC  = &X86::GR16RegClass; -    break; -  case MVT::i32: -    Opc = X86::MOV32rm; -    RC  = &X86::GR32RegClass; -    break; -  case MVT::i64: -    // Must be in x86-64 mode. -    Opc = X86::MOV64rm; -    RC  = &X86::GR64RegClass; -    break; -  case MVT::f32: -    if (X86ScalarSSEf32) { -      Opc = Subtarget->hasAVX() ? X86::VMOVSSrm : X86::MOVSSrm; -      RC  = &X86::FR32RegClass; -    } else { -      Opc = X86::LD_Fp32m; -      RC  = &X86::RFP32RegClass; -    } -    break; -  case MVT::f64: -    if (X86ScalarSSEf64) { -      Opc = Subtarget->hasAVX() ? X86::VMOVSDrm : X86::MOVSDrm; -      RC  = &X86::FR64RegClass; -    } else { -      Opc = X86::LD_Fp64m; -      RC  = &X86::RFP64RegClass; -    } -    break; -  case MVT::f80: -    // No f80 support yet. -    return false; -  } - -  ResultReg = createResultReg(RC); -  MachineInstrBuilder MIB = -    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg); -  addFullAddress(MIB, AM); -  if (MMO) -    MIB->addMemOperand(*FuncInfo.MF, MMO); -  return true; -} - -/// X86FastEmitStore - Emit a machine instruction to store a value Val of -/// type VT. The address is either pre-computed, consisted of a base ptr, Ptr -/// and a displacement offset, or a GlobalAddress, -/// i.e. V. Return true if it is possible. -bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill, -                                   const X86AddressMode &AM, -                                   MachineMemOperand *MMO, bool Aligned) { -  // Get opcode and regclass of the output for the given store instruction. -  unsigned Opc = 0; -  switch (VT.getSimpleVT().SimpleTy) { -  case MVT::f80: // No f80 support yet. -  default: return false; -  case MVT::i1: { -    // Mask out all but lowest bit. -    unsigned AndResult = createResultReg(&X86::GR8RegClass); -    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, -            TII.get(X86::AND8ri), AndResult) -      .addReg(ValReg, getKillRegState(ValIsKill)).addImm(1); -    ValReg = AndResult; -  } -  // FALLTHROUGH, handling i1 as i8. -  case MVT::i8:  Opc = X86::MOV8mr;  break; -  case MVT::i16: Opc = X86::MOV16mr; break; -  case MVT::i32: Opc = X86::MOV32mr; break; -  case MVT::i64: Opc = X86::MOV64mr; break; // Must be in x86-64 mode. -  case MVT::f32: -    Opc = X86ScalarSSEf32 ? -          (Subtarget->hasAVX() ? X86::VMOVSSmr : X86::MOVSSmr) : X86::ST_Fp32m; -    break; -  case MVT::f64: -    Opc = X86ScalarSSEf64 ? -          (Subtarget->hasAVX() ? X86::VMOVSDmr : X86::MOVSDmr) : X86::ST_Fp64m; -    break; -  case MVT::v4f32: -    if (Aligned) -      Opc = Subtarget->hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr; -    else -      Opc = Subtarget->hasAVX() ? X86::VMOVUPSmr : X86::MOVUPSmr; -    break; -  case MVT::v2f64: -    if (Aligned) -      Opc = Subtarget->hasAVX() ? X86::VMOVAPDmr : X86::MOVAPDmr; -    else -      Opc = Subtarget->hasAVX() ? X86::VMOVUPDmr : X86::MOVUPDmr; -    break; -  case MVT::v4i32: -  case MVT::v2i64: -  case MVT::v8i16: -  case MVT::v16i8: -    if (Aligned) -      Opc = Subtarget->hasAVX() ? X86::VMOVDQAmr : X86::MOVDQAmr; -    else -      Opc = Subtarget->hasAVX() ? 
X86::VMOVDQUmr : X86::MOVDQUmr; -    break; -  } - -  MachineInstrBuilder MIB = -    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc)); -  addFullAddress(MIB, AM).addReg(ValReg, getKillRegState(ValIsKill)); -  if (MMO) -    MIB->addMemOperand(*FuncInfo.MF, MMO); - -  return true; -} - -bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val, -                                   const X86AddressMode &AM, -                                   MachineMemOperand *MMO, bool Aligned) { -  // Handle 'null' like i32/i64 0. -  if (isa<ConstantPointerNull>(Val)) -    Val = Constant::getNullValue(DL.getIntPtrType(Val->getContext())); - -  // If this is a store of a simple constant, fold the constant into the store. -  if (const ConstantInt *CI = dyn_cast<ConstantInt>(Val)) { -    unsigned Opc = 0; -    bool Signed = true; -    switch (VT.getSimpleVT().SimpleTy) { -    default: break; -    case MVT::i1:  Signed = false;     // FALLTHROUGH to handle as i8. -    case MVT::i8:  Opc = X86::MOV8mi;  break; -    case MVT::i16: Opc = X86::MOV16mi; break; -    case MVT::i32: Opc = X86::MOV32mi; break; -    case MVT::i64: -      // Must be a 32-bit sign extended value. -      if (isInt<32>(CI->getSExtValue())) -        Opc = X86::MOV64mi32; -      break; -    } - -    if (Opc) { -      MachineInstrBuilder MIB = -        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc)); -      addFullAddress(MIB, AM).addImm(Signed ? (uint64_t) CI->getSExtValue() -                                            : CI->getZExtValue()); -      if (MMO) -        MIB->addMemOperand(*FuncInfo.MF, MMO); -      return true; -    } -  } - -  unsigned ValReg = getRegForValue(Val); -  if (ValReg == 0) -    return false; - -  bool ValKill = hasTrivialKill(Val); -  return X86FastEmitStore(VT, ValReg, ValKill, AM, MMO, Aligned); -} - -/// X86FastEmitExtend - Emit a machine instruction to extend a value Src of -/// type SrcVT to type DstVT using the specified extension opcode Opc (e.g. -/// ISD::SIGN_EXTEND). -bool X86FastISel::X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, -                                    unsigned Src, EVT SrcVT, -                                    unsigned &ResultReg) { -  unsigned RR = fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Opc, -                           Src, /*TODO: Kill=*/false); -  if (RR == 0) -    return false; - -  ResultReg = RR; -  return true; -} - -bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) { -  // Handle constant address. -  if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) { -    // Can't handle alternate code models yet. -    if (TM.getCodeModel() != CodeModel::Small) -      return false; - -    // Can't handle TLS yet. -    if (GV->isThreadLocal()) -      return false; - -    // RIP-relative addresses can't have additional register operands, so if -    // we've already folded stuff into the addressing mode, just force the -    // global value into its own register, which we can use as the basereg. -    if (!Subtarget->isPICStyleRIPRel() || -        (AM.Base.Reg == 0 && AM.IndexReg == 0)) { -      // Okay, we've committed to selecting this global. Set up the address. -      AM.GV = GV; - -      // Allow the subtarget to classify the global. -      unsigned char GVFlags = Subtarget->ClassifyGlobalReference(GV, TM); - -      // If this reference is relative to the pic base, set it now. -      if (isGlobalRelativeToPICBase(GVFlags)) { -        // FIXME: How do we know Base.Reg is free?? 
-        AM.Base.Reg = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF); -      } - -      // Unless the ABI requires an extra load, return a direct reference to -      // the global. -      if (!isGlobalStubReference(GVFlags)) { -        if (Subtarget->isPICStyleRIPRel()) { -          // Use rip-relative addressing if we can.  Above we verified that the -          // base and index registers are unused. -          assert(AM.Base.Reg == 0 && AM.IndexReg == 0); -          AM.Base.Reg = X86::RIP; -        } -        AM.GVOpFlags = GVFlags; -        return true; -      } - -      // Ok, we need to do a load from a stub.  If we've already loaded from -      // this stub, reuse the loaded pointer, otherwise emit the load now. -      DenseMap<const Value *, unsigned>::iterator I = LocalValueMap.find(V); -      unsigned LoadReg; -      if (I != LocalValueMap.end() && I->second != 0) { -        LoadReg = I->second; -      } else { -        // Issue load from stub. -        unsigned Opc = 0; -        const TargetRegisterClass *RC = nullptr; -        X86AddressMode StubAM; -        StubAM.Base.Reg = AM.Base.Reg; -        StubAM.GV = GV; -        StubAM.GVOpFlags = GVFlags; - -        // Prepare for inserting code in the local-value area. -        SavePoint SaveInsertPt = enterLocalValueArea(); - -        if (TLI.getPointerTy() == MVT::i64) { -          Opc = X86::MOV64rm; -          RC  = &X86::GR64RegClass; - -          if (Subtarget->isPICStyleRIPRel()) -            StubAM.Base.Reg = X86::RIP; -        } else { -          Opc = X86::MOV32rm; -          RC  = &X86::GR32RegClass; -        } - -        LoadReg = createResultReg(RC); -        MachineInstrBuilder LoadMI = -          BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), LoadReg); -        addFullAddress(LoadMI, StubAM); - -        // Ok, back to normal mode. -        leaveLocalValueArea(SaveInsertPt); - -        // Prevent loading GV stub multiple times in same MBB. -        LocalValueMap[V] = LoadReg; -      } - -      // Now construct the final address. Note that the Disp, Scale, -      // and Index values may already be set here. -      AM.Base.Reg = LoadReg; -      AM.GV = nullptr; -      return true; -    } -  } - -  // If all else fails, try to materialize the value in a register. -  if (!AM.GV || !Subtarget->isPICStyleRIPRel()) { -    if (AM.Base.Reg == 0) { -      AM.Base.Reg = getRegForValue(V); -      return AM.Base.Reg != 0; -    } -    if (AM.IndexReg == 0) { -      assert(AM.Scale == 1 && "Scale with no index!"); -      AM.IndexReg = getRegForValue(V); -      return AM.IndexReg != 0; -    } -  } - -  return false; -} - -/// X86SelectAddress - Attempt to fill in an address from the given value. -/// -bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) { -  SmallVector<const Value *, 32> GEPs; -redo_gep: -  const User *U = nullptr; -  unsigned Opcode = Instruction::UserOp1; -  if (const Instruction *I = dyn_cast<Instruction>(V)) { -    // Don't walk into other basic blocks; it's possible we haven't -    // visited them yet, so the instructions may not yet be assigned -    // virtual registers. 
-    if (FuncInfo.StaticAllocaMap.count(static_cast<const AllocaInst *>(V)) || -        FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) { -      Opcode = I->getOpcode(); -      U = I; -    } -  } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(V)) { -    Opcode = C->getOpcode(); -    U = C; -  } - -  if (PointerType *Ty = dyn_cast<PointerType>(V->getType())) -    if (Ty->getAddressSpace() > 255) -      // Fast instruction selection doesn't support the special -      // address spaces. -      return false; - -  switch (Opcode) { -  default: break; -  case Instruction::BitCast: -    // Look past bitcasts. -    return X86SelectAddress(U->getOperand(0), AM); - -  case Instruction::IntToPtr: -    // Look past no-op inttoptrs. -    if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy()) -      return X86SelectAddress(U->getOperand(0), AM); -    break; - -  case Instruction::PtrToInt: -    // Look past no-op ptrtoints. -    if (TLI.getValueType(U->getType()) == TLI.getPointerTy()) -      return X86SelectAddress(U->getOperand(0), AM); -    break; - -  case Instruction::Alloca: { -    // Do static allocas. -    const AllocaInst *A = cast<AllocaInst>(V); -    DenseMap<const AllocaInst *, int>::iterator SI = -      FuncInfo.StaticAllocaMap.find(A); -    if (SI != FuncInfo.StaticAllocaMap.end()) { -      AM.BaseType = X86AddressMode::FrameIndexBase; -      AM.Base.FrameIndex = SI->second; -      return true; -    } -    break; -  } - -  case Instruction::Add: { -    // Adds of constants are common and easy enough. -    if (const ConstantInt *CI = dyn_cast<ConstantInt>(U->getOperand(1))) { -      uint64_t Disp = (int32_t)AM.Disp + (uint64_t)CI->getSExtValue(); -      // They have to fit in the 32-bit signed displacement field though. -      if (isInt<32>(Disp)) { -        AM.Disp = (uint32_t)Disp; -        return X86SelectAddress(U->getOperand(0), AM); -      } -    } -    break; -  } - -  case Instruction::GetElementPtr: { -    X86AddressMode SavedAM = AM; - -    // Pattern-match simple GEPs. -    uint64_t Disp = (int32_t)AM.Disp; -    unsigned IndexReg = AM.IndexReg; -    unsigned Scale = AM.Scale; -    gep_type_iterator GTI = gep_type_begin(U); -    // Iterate through the indices, folding what we can. Constants can be -    // folded, and one dynamic index can be handled, if the scale is supported. -    for (User::const_op_iterator i = U->op_begin() + 1, e = U->op_end(); -         i != e; ++i, ++GTI) { -      const Value *Op = *i; -      if (StructType *STy = dyn_cast<StructType>(*GTI)) { -        const StructLayout *SL = DL.getStructLayout(STy); -        Disp += SL->getElementOffset(cast<ConstantInt>(Op)->getZExtValue()); -        continue; -      } - -      // A array/variable index is always of the form i*S where S is the -      // constant scale size.  See if we can push the scale into immediates. -      uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType()); -      for (;;) { -        if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) { -          // Constant-offset addressing. -          Disp += CI->getSExtValue() * S; -          break; -        } -        if (canFoldAddIntoGEP(U, Op)) { -          // A compatible add with a constant operand. Fold the constant. -          ConstantInt *CI = -            cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1)); -          Disp += CI->getSExtValue() * S; -          // Iterate on the other operand. 
-          Op = cast<AddOperator>(Op)->getOperand(0); -          continue; -        } -        if (IndexReg == 0 && -            (!AM.GV || !Subtarget->isPICStyleRIPRel()) && -            (S == 1 || S == 2 || S == 4 || S == 8)) { -          // Scaled-index addressing. -          Scale = S; -          IndexReg = getRegForGEPIndex(Op).first; -          if (IndexReg == 0) -            return false; -          break; -        } -        // Unsupported. -        goto unsupported_gep; -      } -    } - -    // Check for displacement overflow. -    if (!isInt<32>(Disp)) -      break; - -    AM.IndexReg = IndexReg; -    AM.Scale = Scale; -    AM.Disp = (uint32_t)Disp; -    GEPs.push_back(V); - -    if (const GetElementPtrInst *GEP = -          dyn_cast<GetElementPtrInst>(U->getOperand(0))) { -      // Ok, the GEP indices were covered by constant-offset and scaled-index -      // addressing. Update the address state and move on to examining the base. -      V = GEP; -      goto redo_gep; -    } else if (X86SelectAddress(U->getOperand(0), AM)) { -      return true; -    } - -    // If we couldn't merge the gep value into this addr mode, revert back to -    // our address and just match the value instead of completely failing. -    AM = SavedAM; - -    for (SmallVectorImpl<const Value *>::reverse_iterator -           I = GEPs.rbegin(), E = GEPs.rend(); I != E; ++I) -      if (handleConstantAddresses(*I, AM)) -        return true; - -    return false; -  unsupported_gep: -    // Ok, the GEP indices weren't all covered. -    break; -  } -  } - -  return handleConstantAddresses(V, AM); -} - -/// X86SelectCallAddress - Attempt to fill in an address from the given value. -/// -bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) { -  const User *U = nullptr; -  unsigned Opcode = Instruction::UserOp1; -  const Instruction *I = dyn_cast<Instruction>(V); -  // Record if the value is defined in the same basic block. -  // -  // This information is crucial to know whether or not folding an -  // operand is valid. -  // Indeed, FastISel generates or reuses a virtual register for all -  // operands of all instructions it selects. Obviously, the definition and -  // its uses must use the same virtual register otherwise the produced -  // code is incorrect. -  // Before instruction selection, FunctionLoweringInfo::set sets the virtual -  // registers for values that are alive across basic blocks. This ensures -  // that the values are consistently set between across basic block, even -  // if different instruction selection mechanisms are used (e.g., a mix of -  // SDISel and FastISel). -  // For values local to a basic block, the instruction selection process -  // generates these virtual registers with whatever method is appropriate -  // for its needs. In particular, FastISel and SDISel do not share the way -  // local virtual registers are set. -  // Therefore, this is impossible (or at least unsafe) to share values -  // between basic blocks unless they use the same instruction selection -  // method, which is not guarantee for X86. -  // Moreover, things like hasOneUse could not be used accurately, if we -  // allow to reference values across basic blocks whereas they are not -  // alive across basic blocks initially. 
-  bool InMBB = true; -  if (I) { -    Opcode = I->getOpcode(); -    U = I; -    InMBB = I->getParent() == FuncInfo.MBB->getBasicBlock(); -  } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(V)) { -    Opcode = C->getOpcode(); -    U = C; -  } - -  switch (Opcode) { -  default: break; -  case Instruction::BitCast: -    // Look past bitcasts if its operand is in the same BB. -    if (InMBB) -      return X86SelectCallAddress(U->getOperand(0), AM); -    break; - -  case Instruction::IntToPtr: -    // Look past no-op inttoptrs if its operand is in the same BB. -    if (InMBB && -        TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy()) -      return X86SelectCallAddress(U->getOperand(0), AM); -    break; - -  case Instruction::PtrToInt: -    // Look past no-op ptrtoints if its operand is in the same BB. -    if (InMBB && -        TLI.getValueType(U->getType()) == TLI.getPointerTy()) -      return X86SelectCallAddress(U->getOperand(0), AM); -    break; -  } - -  // Handle constant address. -  if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) { -    // Can't handle alternate code models yet. -    if (TM.getCodeModel() != CodeModel::Small) -      return false; - -    // RIP-relative addresses can't have additional register operands. -    if (Subtarget->isPICStyleRIPRel() && -        (AM.Base.Reg != 0 || AM.IndexReg != 0)) -      return false; - -    // Can't handle DLL Import. -    if (GV->hasDLLImportStorageClass()) -      return false; - -    // Can't handle TLS. -    if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV)) -      if (GVar->isThreadLocal()) -        return false; - -    // Okay, we've committed to selecting this global. Set up the basic address. -    AM.GV = GV; - -    // No ABI requires an extra load for anything other than DLLImport, which -    // we rejected above. Return a direct reference to the global. -    if (Subtarget->isPICStyleRIPRel()) { -      // Use rip-relative addressing if we can.  Above we verified that the -      // base and index registers are unused. -      assert(AM.Base.Reg == 0 && AM.IndexReg == 0); -      AM.Base.Reg = X86::RIP; -    } else if (Subtarget->isPICStyleStubPIC()) { -      AM.GVOpFlags = X86II::MO_PIC_BASE_OFFSET; -    } else if (Subtarget->isPICStyleGOT()) { -      AM.GVOpFlags = X86II::MO_GOTOFF; -    } - -    return true; -  } - -  // If all else fails, try to materialize the value in a register. -  if (!AM.GV || !Subtarget->isPICStyleRIPRel()) { -    if (AM.Base.Reg == 0) { -      AM.Base.Reg = getRegForValue(V); -      return AM.Base.Reg != 0; -    } -    if (AM.IndexReg == 0) { -      assert(AM.Scale == 1 && "Scale with no index!"); -      AM.IndexReg = getRegForValue(V); -      return AM.IndexReg != 0; -    } -  } - -  return false; -} - - -/// X86SelectStore - Select and emit code to implement store instructions. -bool X86FastISel::X86SelectStore(const Instruction *I) { -  // Atomic stores need special handling. 
-  const StoreInst *S = cast<StoreInst>(I); - -  if (S->isAtomic()) -    return false; - -  const Value *Val = S->getValueOperand(); -  const Value *Ptr = S->getPointerOperand(); - -  MVT VT; -  if (!isTypeLegal(Val->getType(), VT, /*AllowI1=*/true)) -    return false; - -  unsigned Alignment = S->getAlignment(); -  unsigned ABIAlignment = DL.getABITypeAlignment(Val->getType()); -  if (Alignment == 0) // Ensure that codegen never sees alignment 0 -    Alignment = ABIAlignment; -  bool Aligned = Alignment >= ABIAlignment; - -  X86AddressMode AM; -  if (!X86SelectAddress(Ptr, AM)) -    return false; - -  return X86FastEmitStore(VT, Val, AM, createMachineMemOperandFor(I), Aligned); -} - -/// X86SelectRet - Select and emit code to implement ret instructions. -bool X86FastISel::X86SelectRet(const Instruction *I) { -  const ReturnInst *Ret = cast<ReturnInst>(I); -  const Function &F = *I->getParent()->getParent(); -  const X86MachineFunctionInfo *X86MFInfo = -      FuncInfo.MF->getInfo<X86MachineFunctionInfo>(); - -  if (!FuncInfo.CanLowerReturn) -    return false; - -  CallingConv::ID CC = F.getCallingConv(); -  if (CC != CallingConv::C && -      CC != CallingConv::Fast && -      CC != CallingConv::X86_FastCall && -      CC != CallingConv::X86_64_SysV) -    return false; - -  if (Subtarget->isCallingConvWin64(CC)) -    return false; - -  // Don't handle popping bytes on return for now. -  if (X86MFInfo->getBytesToPopOnReturn() != 0) -    return false; - -  // fastcc with -tailcallopt is intended to provide a guaranteed -  // tail call optimization. Fastisel doesn't know how to do that. -  if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) -    return false; - -  // Let SDISel handle vararg functions. -  if (F.isVarArg()) -    return false; - -  // Build a list of return value registers. -  SmallVector<unsigned, 4> RetRegs; - -  if (Ret->getNumOperands() > 0) { -    SmallVector<ISD::OutputArg, 4> Outs; -    GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI); - -    // Analyze operands of the call, assigning locations to each operand. -    SmallVector<CCValAssign, 16> ValLocs; -    CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, ValLocs, I->getContext()); -    CCInfo.AnalyzeReturn(Outs, RetCC_X86); - -    const Value *RV = Ret->getOperand(0); -    unsigned Reg = getRegForValue(RV); -    if (Reg == 0) -      return false; - -    // Only handle a single return value for now. -    if (ValLocs.size() != 1) -      return false; - -    CCValAssign &VA = ValLocs[0]; - -    // Don't bother handling odd stuff for now. -    if (VA.getLocInfo() != CCValAssign::Full) -      return false; -    // Only handle register returns for now. -    if (!VA.isRegLoc()) -      return false; - -    // The calling-convention tables for x87 returns don't tell -    // the whole story. -    if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) -      return false; - -    unsigned SrcReg = Reg + VA.getValNo(); -    EVT SrcVT = TLI.getValueType(RV->getType()); -    EVT DstVT = VA.getValVT(); -    // Special handling for extended integers. 
-    if (SrcVT != DstVT) { -      if (SrcVT != MVT::i1 && SrcVT != MVT::i8 && SrcVT != MVT::i16) -        return false; - -      if (!Outs[0].Flags.isZExt() && !Outs[0].Flags.isSExt()) -        return false; - -      assert(DstVT == MVT::i32 && "X86 should always ext to i32"); - -      if (SrcVT == MVT::i1) { -        if (Outs[0].Flags.isSExt()) -          return false; -        SrcReg = fastEmitZExtFromI1(MVT::i8, SrcReg, /*TODO: Kill=*/false); -        SrcVT = MVT::i8; -      } -      unsigned Op = Outs[0].Flags.isZExt() ? ISD::ZERO_EXTEND : -                                             ISD::SIGN_EXTEND; -      SrcReg = fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Op, -                          SrcReg, /*TODO: Kill=*/false); -    } - -    // Make the copy. -    unsigned DstReg = VA.getLocReg(); -    const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg); -    // Avoid a cross-class copy. This is very unlikely. -    if (!SrcRC->contains(DstReg)) -      return false; -    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, -            TII.get(TargetOpcode::COPY), DstReg).addReg(SrcReg); - -    // Add register to return instruction. -    RetRegs.push_back(VA.getLocReg()); -  } - -  // The x86-64 ABI for returning structs by value requires that we copy -  // the sret argument into %rax for the return. We saved the argument into -  // a virtual register in the entry block, so now we copy the value out -  // and into %rax. We also do the same with %eax for Win32. -  if (F.hasStructRetAttr() && -      (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) { -    unsigned Reg = X86MFInfo->getSRetReturnReg(); -    assert(Reg && -           "SRetReturnReg should have been set in LowerFormalArguments()!"); -    unsigned RetReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; -    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, -            TII.get(TargetOpcode::COPY), RetReg).addReg(Reg); -    RetRegs.push_back(RetReg); -  } - -  // Now emit the RET. -  MachineInstrBuilder MIB = -    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, -            TII.get(Subtarget->is64Bit() ? X86::RETQ : X86::RETL)); -  for (unsigned i = 0, e = RetRegs.size(); i != e; ++i) -    MIB.addReg(RetRegs[i], RegState::Implicit); -  return true; -} - -/// X86SelectLoad - Select and emit code to implement load instructions. -/// -bool X86FastISel::X86SelectLoad(const Instruction *I) { -  const LoadInst *LI = cast<LoadInst>(I); - -  // Atomic loads need special handling. -  if (LI->isAtomic()) -    return false; - -  MVT VT; -  if (!isTypeLegal(LI->getType(), VT, /*AllowI1=*/true)) -    return false; - -  const Value *Ptr = LI->getPointerOperand(); - -  X86AddressMode AM; -  if (!X86SelectAddress(Ptr, AM)) -    return false; - -  unsigned ResultReg = 0; -  if (!X86FastEmitLoad(VT, AM, createMachineMemOperandFor(LI), ResultReg)) -    return false; - -  updateValueMap(I, ResultReg); -  return true; -} - -static unsigned X86ChooseCmpOpcode(EVT VT, const X86Subtarget *Subtarget) { -  bool HasAVX = Subtarget->hasAVX(); -  bool X86ScalarSSEf32 = Subtarget->hasSSE1(); -  bool X86ScalarSSEf64 = Subtarget->hasSSE2(); - -  switch (VT.getSimpleVT().SimpleTy) { -  default:       return 0; -  case MVT::i8:  return X86::CMP8rr; -  case MVT::i16: return X86::CMP16rr; -  case MVT::i32: return X86::CMP32rr; -  case MVT::i64: return X86::CMP64rr; -  case MVT::f32: -    return X86ScalarSSEf32 ? (HasAVX ? X86::VUCOMISSrr : X86::UCOMISSrr) : 0; -  case MVT::f64: -    return X86ScalarSSEf64 ? (HasAVX ? 
X86::VUCOMISDrr : X86::UCOMISDrr) : 0; -  } -} - -/// X86ChooseCmpImmediateOpcode - If we have a comparison with RHS as the RHS -/// of the comparison, return an opcode that works for the compare (e.g. -/// CMP32ri) otherwise return 0. -static unsigned X86ChooseCmpImmediateOpcode(EVT VT, const ConstantInt *RHSC) { -  switch (VT.getSimpleVT().SimpleTy) { -  // Otherwise, we can't fold the immediate into this comparison. -  default: return 0; -  case MVT::i8: return X86::CMP8ri; -  case MVT::i16: return X86::CMP16ri; -  case MVT::i32: return X86::CMP32ri; -  case MVT::i64: -    // 64-bit comparisons are only valid if the immediate fits in a 32-bit sext -    // field. -    if ((int)RHSC->getSExtValue() == RHSC->getSExtValue()) -      return X86::CMP64ri32; -    return 0; -  } -} - -bool X86FastISel::X86FastEmitCompare(const Value *Op0, const Value *Op1, -                                     EVT VT, DebugLoc CurDbgLoc) { -  unsigned Op0Reg = getRegForValue(Op0); -  if (Op0Reg == 0) return false; - -  // Handle 'null' like i32/i64 0. -  if (isa<ConstantPointerNull>(Op1)) -    Op1 = Constant::getNullValue(DL.getIntPtrType(Op0->getContext())); - -  // We have two options: compare with register or immediate.  If the RHS of -  // the compare is an immediate that we can fold into this compare, use -  // CMPri, otherwise use CMPrr. -  if (const ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) { -    if (unsigned CompareImmOpc = X86ChooseCmpImmediateOpcode(VT, Op1C)) { -      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, CurDbgLoc, TII.get(CompareImmOpc)) -        .addReg(Op0Reg) -        .addImm(Op1C->getSExtValue()); -      return true; -    } -  } - -  unsigned CompareOpc = X86ChooseCmpOpcode(VT, Subtarget); -  if (CompareOpc == 0) return false; - -  unsigned Op1Reg = getRegForValue(Op1); -  if (Op1Reg == 0) return false; -  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, CurDbgLoc, TII.get(CompareOpc)) -    .addReg(Op0Reg) -    .addReg(Op1Reg); - -  return true; -} - -bool X86FastISel::X86SelectCmp(const Instruction *I) { -  const CmpInst *CI = cast<CmpInst>(I); - -  MVT VT; -  if (!isTypeLegal(I->getOperand(0)->getType(), VT)) -    return false; - -  // Try to optimize or fold the cmp. -  CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); -  unsigned ResultReg = 0; -  switch (Predicate) { -  default: break; -  case CmpInst::FCMP_FALSE: { -    ResultReg = createResultReg(&X86::GR32RegClass); -    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV32r0), -            ResultReg); -    ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultReg, /*Kill=*/true, -                                           X86::sub_8bit); -    if (!ResultReg) -      return false; -    break; -  } -  case CmpInst::FCMP_TRUE: { -    ResultReg = createResultReg(&X86::GR8RegClass); -    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV8ri), -            ResultReg).addImm(1); -    break; -  } -  } - -  if (ResultReg) { -    updateValueMap(I, ResultReg); -    return true; -  } - -  const Value *LHS = CI->getOperand(0); -  const Value *RHS = CI->getOperand(1); - -  // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, 0.0. -  // We don't have to materialize a zero constant for this case and can just use -  // %x again on the RHS. 
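// Illustrative aside (editor's sketch, not part of the patch): the
// X86ChooseCmpImmediateOpcode logic above only picks CMP64ri32 when the
// 64-bit constant round-trips through the instruction's sign-extended
// 32-bit immediate field. A standalone equivalent of that check, using
// only <cstdint>; the helper name is made up for illustration:
#include <cstdint>

static bool fitsInSignExtendedImm32(int64_t Imm) {
  // CMP64ri32 sign-extends its 32-bit immediate back to 64 bits at
  // execution time, so the constant is representable iff it lies in
  // [INT32_MIN, INT32_MAX]; e.g. -1 qualifies, (1LL << 40) does not.
  return Imm >= INT32_MIN && Imm <= INT32_MAX;
}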
-  if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) { -    const auto *RHSC = dyn_cast<ConstantFP>(RHS); -    if (RHSC && RHSC->isNullValue()) -      RHS = LHS; -  } - -  // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction. -  static unsigned SETFOpcTable[2][3] = { -    { X86::SETEr,  X86::SETNPr, X86::AND8rr }, -    { X86::SETNEr, X86::SETPr,  X86::OR8rr  } -  }; -  unsigned *SETFOpc = nullptr; -  switch (Predicate) { -  default: break; -  case CmpInst::FCMP_OEQ: SETFOpc = &SETFOpcTable[0][0]; break; -  case CmpInst::FCMP_UNE: SETFOpc = &SETFOpcTable[1][0]; break; -  } - -  ResultReg = createResultReg(&X86::GR8RegClass); -  if (SETFOpc) { -    if (!X86FastEmitCompare(LHS, RHS, VT, I->getDebugLoc())) -      return false; - -    unsigned FlagReg1 = createResultReg(&X86::GR8RegClass); -    unsigned FlagReg2 = createResultReg(&X86::GR8RegClass); -    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[0]), -            FlagReg1); -    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[1]), -            FlagReg2); -    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[2]), -            ResultReg).addReg(FlagReg1).addReg(FlagReg2); -    updateValueMap(I, ResultReg); -    return true; -  } - -  X86::CondCode CC; -  bool SwapArgs; -  std::tie(CC, SwapArgs) = getX86ConditionCode(Predicate); -  assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code."); -  unsigned Opc = X86::getSETFromCond(CC); - -  if (SwapArgs) -    std::swap(LHS, RHS); - -  // Emit a compare of LHS/RHS. -  if (!X86FastEmitCompare(LHS, RHS, VT, I->getDebugLoc())) -    return false; - -  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg); -  updateValueMap(I, ResultReg); -  return true; -} - -bool X86FastISel::X86SelectZExt(const Instruction *I) { -  EVT DstVT = TLI.getValueType(I->getType()); -  if (!TLI.isTypeLegal(DstVT)) -    return false; - -  unsigned ResultReg = getRegForValue(I->getOperand(0)); -  if (ResultReg == 0) -    return false; - -  // Handle zero-extension from i1 to i8, which is common. -  MVT SrcVT = TLI.getSimpleValueType(I->getOperand(0)->getType()); -  if (SrcVT.SimpleTy == MVT::i1) { -    // Set the high bits to zero. -    ResultReg = fastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false); -    SrcVT = MVT::i8; - -    if (ResultReg == 0) -      return false; -  } - -  if (DstVT == MVT::i64) { -    // Handle extension to 64-bits via sub-register shenanigans. 
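// Illustrative aside (editor's sketch, not part of the patch): the
// SETE/SETNP/AND and SETNE/SETP/OR triples above rebuild FCMP_OEQ and
// FCMP_UNE from the EFLAGS that UCOMISS/UCOMISD produce (ZF=1, PF=0 for
// equal; ZF=PF=1 for unordered). A plain C++ model of the two
// combinations, with hypothetical flag parameters:
static bool fcmpOEQFromFlags(bool ZF, bool PF) {
  // SETE yields ZF, SETNP yields !PF, AND8rr combines them.
  return ZF && !PF;
}
static bool fcmpUNEFromFlags(bool ZF, bool PF) {
  // SETNE yields !ZF, SETP yields PF, OR8rr combines them.
  return !ZF || PF;
}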
-    unsigned MovInst; - -    switch (SrcVT.SimpleTy) { -    case MVT::i8:  MovInst = X86::MOVZX32rr8;  break; -    case MVT::i16: MovInst = X86::MOVZX32rr16; break; -    case MVT::i32: MovInst = X86::MOV32rr;     break; -    default: llvm_unreachable("Unexpected zext to i64 source type"); -    } - -    unsigned Result32 = createResultReg(&X86::GR32RegClass); -    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(MovInst), Result32) -      .addReg(ResultReg); - -    ResultReg = createResultReg(&X86::GR64RegClass); -    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::SUBREG_TO_REG), -            ResultReg) -      .addImm(0).addReg(Result32).addImm(X86::sub_32bit); -  } else if (DstVT != MVT::i8) { -    ResultReg = fastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::ZERO_EXTEND, -                           ResultReg, /*Kill=*/true); -    if (ResultReg == 0) -      return false; -  } - -  updateValueMap(I, ResultReg); -  return true; -} - -bool X86FastISel::X86SelectBranch(const Instruction *I) { -  // Unconditional branches are selected by tablegen-generated code. -  // Handle a conditional branch. -  const BranchInst *BI = cast<BranchInst>(I); -  MachineBasicBlock *TrueMBB = FuncInfo.MBBMap[BI->getSuccessor(0)]; -  MachineBasicBlock *FalseMBB = FuncInfo.MBBMap[BI->getSuccessor(1)]; - -  // Fold the common case of a conditional branch with a comparison -  // in the same block (values defined on other blocks may not have -  // initialized registers). -  X86::CondCode CC; -  if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) { -    if (CI->hasOneUse() && CI->getParent() == I->getParent()) { -      EVT VT = TLI.getValueType(CI->getOperand(0)->getType()); - -      // Try to optimize or fold the cmp. -      CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); -      switch (Predicate) { -      default: break; -      case CmpInst::FCMP_FALSE: fastEmitBranch(FalseMBB, DbgLoc); return true; -      case CmpInst::FCMP_TRUE:  fastEmitBranch(TrueMBB, DbgLoc); return true; -      } - -      const Value *CmpLHS = CI->getOperand(0); -      const Value *CmpRHS = CI->getOperand(1); - -      // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, -      // 0.0. -      // We don't have to materialize a zero constant for this case and can just -      // use %x again on the RHS. -      if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) { -        const auto *CmpRHSC = dyn_cast<ConstantFP>(CmpRHS); -        if (CmpRHSC && CmpRHSC->isNullValue()) -          CmpRHS = CmpLHS; -      } - -      // Try to take advantage of fallthrough opportunities. -      if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) { -        std::swap(TrueMBB, FalseMBB); -        Predicate = CmpInst::getInversePredicate(Predicate); -      } - -      // FCMP_OEQ and FCMP_UNE cannot be expressed with a single flag/condition -      // code check. Instead two branch instructions are required to check all -      // the flags. First we change the predicate to a supported condition code, -      // which will be the first branch. Later one we will emit the second -      // branch. 
-      bool NeedExtraBranch = false; -      switch (Predicate) { -      default: break; -      case CmpInst::FCMP_OEQ: -        std::swap(TrueMBB, FalseMBB); // fall-through -      case CmpInst::FCMP_UNE: -        NeedExtraBranch = true; -        Predicate = CmpInst::FCMP_ONE; -        break; -      } - -      bool SwapArgs; -      unsigned BranchOpc; -      std::tie(CC, SwapArgs) = getX86ConditionCode(Predicate); -      assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code."); - -      BranchOpc = X86::GetCondBranchFromCond(CC); -      if (SwapArgs) -        std::swap(CmpLHS, CmpRHS); - -      // Emit a compare of the LHS and RHS, setting the flags. -      if (!X86FastEmitCompare(CmpLHS, CmpRHS, VT, CI->getDebugLoc())) -        return false; - -      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc)) -        .addMBB(TrueMBB); - -      // X86 requires a second branch to handle UNE (and OEQ, which is mapped -      // to UNE above). -      if (NeedExtraBranch) { -        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JP_1)) -          .addMBB(TrueMBB); -      } - -      // Obtain the branch weight and add the TrueBB to the successor list. -      uint32_t BranchWeight = 0; -      if (FuncInfo.BPI) -        BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), -                                                   TrueMBB->getBasicBlock()); -      FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight); - -      // Emits an unconditional branch to the FalseBB, obtains the branch -      // weight, and adds it to the successor list. -      fastEmitBranch(FalseMBB, DbgLoc); - -      return true; -    } -  } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) { -    // Handle things like "%cond = trunc i32 %X to i1 / br i1 %cond", which -    // typically happen for _Bool and C++ bools. -    MVT SourceVT; -    if (TI->hasOneUse() && TI->getParent() == I->getParent() && -        isTypeLegal(TI->getOperand(0)->getType(), SourceVT)) { -      unsigned TestOpc = 0; -      switch (SourceVT.SimpleTy) { -      default: break; -      case MVT::i8:  TestOpc = X86::TEST8ri; break; -      case MVT::i16: TestOpc = X86::TEST16ri; break; -      case MVT::i32: TestOpc = X86::TEST32ri; break; -      case MVT::i64: TestOpc = X86::TEST64ri32; break; -      } -      if (TestOpc) { -        unsigned OpReg = getRegForValue(TI->getOperand(0)); -        if (OpReg == 0) return false; -        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TestOpc)) -          .addReg(OpReg).addImm(1); - -        unsigned JmpOpc = X86::JNE_1; -        if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) { -          std::swap(TrueMBB, FalseMBB); -          JmpOpc = X86::JE_1; -        } - -        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(JmpOpc)) -          .addMBB(TrueMBB); -        fastEmitBranch(FalseMBB, DbgLoc); -        uint32_t BranchWeight = 0; -        if (FuncInfo.BPI) -          BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), -                                                     TrueMBB->getBasicBlock()); -        FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight); -        return true; -      } -    } -  } else if (foldX86XALUIntrinsic(CC, BI, BI->getCondition())) { -    // Fake request the condition, otherwise the intrinsic might be completely -    // optimized away. 
-    unsigned TmpReg = getRegForValue(BI->getCondition()); -    if (TmpReg == 0) -      return false; - -    unsigned BranchOpc = X86::GetCondBranchFromCond(CC); - -    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc)) -      .addMBB(TrueMBB); -    fastEmitBranch(FalseMBB, DbgLoc); -    uint32_t BranchWeight = 0; -    if (FuncInfo.BPI) -      BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), -                                                 TrueMBB->getBasicBlock()); -    FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight); -    return true; -  } - -  // Otherwise do a clumsy setcc and re-test it. -  // Note that i1 essentially gets ANY_EXTEND'ed to i8 where it isn't used -  // in an explicit cast, so make sure to handle that correctly. -  unsigned OpReg = getRegForValue(BI->getCondition()); -  if (OpReg == 0) return false; - -  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri)) -    .addReg(OpReg).addImm(1); -  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JNE_1)) -    .addMBB(TrueMBB); -  fastEmitBranch(FalseMBB, DbgLoc); -  uint32_t BranchWeight = 0; -  if (FuncInfo.BPI) -    BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), -                                               TrueMBB->getBasicBlock()); -  FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight); -  return true; -} - -bool X86FastISel::X86SelectShift(const Instruction *I) { -  unsigned CReg = 0, OpReg = 0; -  const TargetRegisterClass *RC = nullptr; -  if (I->getType()->isIntegerTy(8)) { -    CReg = X86::CL; -    RC = &X86::GR8RegClass; -    switch (I->getOpcode()) { -    case Instruction::LShr: OpReg = X86::SHR8rCL; break; -    case Instruction::AShr: OpReg = X86::SAR8rCL; break; -    case Instruction::Shl:  OpReg = X86::SHL8rCL; break; -    default: return false; -    } -  } else if (I->getType()->isIntegerTy(16)) { -    CReg = X86::CX; -    RC = &X86::GR16RegClass; -    switch (I->getOpcode()) { -    case Instruction::LShr: OpReg = X86::SHR16rCL; break; -    case Instruction::AShr: OpReg = X86::SAR16rCL; break; -    case Instruction::Shl:  OpReg = X86::SHL16rCL; break; -    default: return false; -    } -  } else if (I->getType()->isIntegerTy(32)) { -    CReg = X86::ECX; -    RC = &X86::GR32RegClass; -    switch (I->getOpcode()) { -    case Instruction::LShr: OpReg = X86::SHR32rCL; break; -    case Instruction::AShr: OpReg = X86::SAR32rCL; break; -    case Instruction::Shl:  OpReg = X86::SHL32rCL; break; -    default: return false; -    } -  } else if (I->getType()->isIntegerTy(64)) { -    CReg = X86::RCX; -    RC = &X86::GR64RegClass; -    switch (I->getOpcode()) { -    case Instruction::LShr: OpReg = X86::SHR64rCL; break; -    case Instruction::AShr: OpReg = X86::SAR64rCL; break; -    case Instruction::Shl:  OpReg = X86::SHL64rCL; break; -    default: return false; -    } -  } else { -    return false; -  } - -  MVT VT; -  if (!isTypeLegal(I->getType(), VT)) -    return false; - -  unsigned Op0Reg = getRegForValue(I->getOperand(0)); -  if (Op0Reg == 0) return false; - -  unsigned Op1Reg = getRegForValue(I->getOperand(1)); -  if (Op1Reg == 0) return false; -  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), -          CReg).addReg(Op1Reg); - -  // The shift instruction uses X86::CL. If we defined a super-register -  // of X86::CL, emit a subreg KILL to precisely describe what we're doing here. 
-  if (CReg != X86::CL) -    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, -            TII.get(TargetOpcode::KILL), X86::CL) -      .addReg(CReg, RegState::Kill); - -  unsigned ResultReg = createResultReg(RC); -  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(OpReg), ResultReg) -    .addReg(Op0Reg); -  updateValueMap(I, ResultReg); -  return true; -} - -bool X86FastISel::X86SelectDivRem(const Instruction *I) { -  const static unsigned NumTypes = 4; // i8, i16, i32, i64 -  const static unsigned NumOps   = 4; // SDiv, SRem, UDiv, URem -  const static bool S = true;  // IsSigned -  const static bool U = false; // !IsSigned -  const static unsigned Copy = TargetOpcode::COPY; -  // For the X86 DIV/IDIV instruction, in most cases the dividend -  // (numerator) must be in a specific register pair highreg:lowreg, -  // producing the quotient in lowreg and the remainder in highreg. -  // For most data types, to set up the instruction, the dividend is -  // copied into lowreg, and lowreg is sign-extended or zero-extended -  // into highreg.  The exception is i8, where the dividend is defined -  // as a single register rather than a register pair, and we -  // therefore directly sign-extend or zero-extend the dividend into -  // lowreg, instead of copying, and ignore the highreg. -  const static struct DivRemEntry { -    // The following portion depends only on the data type. -    const TargetRegisterClass *RC; -    unsigned LowInReg;  // low part of the register pair -    unsigned HighInReg; // high part of the register pair -    // The following portion depends on both the data type and the operation. -    struct DivRemResult { -    unsigned OpDivRem;        // The specific DIV/IDIV opcode to use. -    unsigned OpSignExtend;    // Opcode for sign-extending lowreg into -                              // highreg, or copying a zero into highreg. -    unsigned OpCopy;          // Opcode for copying dividend into lowreg, or -                              // zero/sign-extending into lowreg for i8. -    unsigned DivRemResultReg; // Register containing the desired result. -    bool IsOpSigned;          // Whether to use signed or unsigned form. 
-    } ResultTable[NumOps]; -  } OpTable[NumTypes] = { -    { &X86::GR8RegClass,  X86::AX,  0, { -        { X86::IDIV8r,  0,            X86::MOVSX16rr8, X86::AL,  S }, // SDiv -        { X86::IDIV8r,  0,            X86::MOVSX16rr8, X86::AH,  S }, // SRem -        { X86::DIV8r,   0,            X86::MOVZX16rr8, X86::AL,  U }, // UDiv -        { X86::DIV8r,   0,            X86::MOVZX16rr8, X86::AH,  U }, // URem -      } -    }, // i8 -    { &X86::GR16RegClass, X86::AX,  X86::DX, { -        { X86::IDIV16r, X86::CWD,     Copy,            X86::AX,  S }, // SDiv -        { X86::IDIV16r, X86::CWD,     Copy,            X86::DX,  S }, // SRem -        { X86::DIV16r,  X86::MOV32r0, Copy,            X86::AX,  U }, // UDiv -        { X86::DIV16r,  X86::MOV32r0, Copy,            X86::DX,  U }, // URem -      } -    }, // i16 -    { &X86::GR32RegClass, X86::EAX, X86::EDX, { -        { X86::IDIV32r, X86::CDQ,     Copy,            X86::EAX, S }, // SDiv -        { X86::IDIV32r, X86::CDQ,     Copy,            X86::EDX, S }, // SRem -        { X86::DIV32r,  X86::MOV32r0, Copy,            X86::EAX, U }, // UDiv -        { X86::DIV32r,  X86::MOV32r0, Copy,            X86::EDX, U }, // URem -      } -    }, // i32 -    { &X86::GR64RegClass, X86::RAX, X86::RDX, { -        { X86::IDIV64r, X86::CQO,     Copy,            X86::RAX, S }, // SDiv -        { X86::IDIV64r, X86::CQO,     Copy,            X86::RDX, S }, // SRem -        { X86::DIV64r,  X86::MOV32r0, Copy,            X86::RAX, U }, // UDiv -        { X86::DIV64r,  X86::MOV32r0, Copy,            X86::RDX, U }, // URem -      } -    }, // i64 -  }; - -  MVT VT; -  if (!isTypeLegal(I->getType(), VT)) -    return false; - -  unsigned TypeIndex, OpIndex; -  switch (VT.SimpleTy) { -  default: return false; -  case MVT::i8:  TypeIndex = 0; break; -  case MVT::i16: TypeIndex = 1; break; -  case MVT::i32: TypeIndex = 2; break; -  case MVT::i64: TypeIndex = 3; -    if (!Subtarget->is64Bit()) -      return false; -    break; -  } - -  switch (I->getOpcode()) { -  default: llvm_unreachable("Unexpected div/rem opcode"); -  case Instruction::SDiv: OpIndex = 0; break; -  case Instruction::SRem: OpIndex = 1; break; -  case Instruction::UDiv: OpIndex = 2; break; -  case Instruction::URem: OpIndex = 3; break; -  } - -  const DivRemEntry &TypeEntry = OpTable[TypeIndex]; -  const DivRemEntry::DivRemResult &OpEntry = TypeEntry.ResultTable[OpIndex]; -  unsigned Op0Reg = getRegForValue(I->getOperand(0)); -  if (Op0Reg == 0) -    return false; -  unsigned Op1Reg = getRegForValue(I->getOperand(1)); -  if (Op1Reg == 0) -    return false; - -  // Move op0 into low-order input register. -  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, -          TII.get(OpEntry.OpCopy), TypeEntry.LowInReg).addReg(Op0Reg); -  // Zero-extend or sign-extend into high-order input register. -  if (OpEntry.OpSignExtend) { -    if (OpEntry.IsOpSigned) -      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, -              TII.get(OpEntry.OpSignExtend)); -    else { -      unsigned Zero32 = createResultReg(&X86::GR32RegClass); -      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, -              TII.get(X86::MOV32r0), Zero32); - -      // Copy the zero into the appropriate sub/super/identical physical -      // register. Unfortunately the operations needed are not uniform enough -      // to fit neatly into the table above. 
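// Illustrative aside (editor's sketch, not part of the patch): the table
// above widens the dividend before DIV/IDIV either by sign-extending the
// low register into the high one (CWD/CDQ/CQO) or by zeroing the high
// register for the unsigned forms. The same widening for the 32-bit case
// in plain C++; helper names are illustrative only:
#include <cstdint>

static uint32_t highHalfForSignedDiv(int32_t Lo) {
  // CDQ: EDX becomes the sign bit of EAX replicated into all 32 bits.
  return Lo < 0 ? 0xFFFFFFFFu : 0u;
}
static uint32_t highHalfForUnsignedDiv(uint32_t /*Lo*/) {
  // MOV32r0: EDX is simply cleared before DIV.
  return 0u;
}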
-      if (VT.SimpleTy == MVT::i16) { -        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, -                TII.get(Copy), TypeEntry.HighInReg) -          .addReg(Zero32, 0, X86::sub_16bit); -      } else if (VT.SimpleTy == MVT::i32) { -        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, -                TII.get(Copy), TypeEntry.HighInReg) -            .addReg(Zero32); -      } else if (VT.SimpleTy == MVT::i64) { -        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, -                TII.get(TargetOpcode::SUBREG_TO_REG), TypeEntry.HighInReg) -            .addImm(0).addReg(Zero32).addImm(X86::sub_32bit); -      } -    } -  } -  // Generate the DIV/IDIV instruction. -  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, -          TII.get(OpEntry.OpDivRem)).addReg(Op1Reg); -  // For i8 remainder, we can't reference AH directly, as we'll end -  // up with bogus copies like %R9B = COPY %AH. Reference AX -  // instead to prevent AH references in a REX instruction. -  // -  // The current assumption of the fast register allocator is that isel -  // won't generate explicit references to the GPR8_NOREX registers. If -  // the allocator and/or the backend get enhanced to be more robust in -  // that regard, this can be, and should be, removed. -  unsigned ResultReg = 0; -  if ((I->getOpcode() == Instruction::SRem || -       I->getOpcode() == Instruction::URem) && -      OpEntry.DivRemResultReg == X86::AH && Subtarget->is64Bit()) { -    unsigned SourceSuperReg = createResultReg(&X86::GR16RegClass); -    unsigned ResultSuperReg = createResultReg(&X86::GR16RegClass); -    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, -            TII.get(Copy), SourceSuperReg).addReg(X86::AX); - -    // Shift AX right by 8 bits instead of using AH. -    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SHR16ri), -            ResultSuperReg).addReg(SourceSuperReg).addImm(8); - -    // Now reference the 8-bit subreg of the result. -    ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultSuperReg, -                                           /*Kill=*/true, X86::sub_8bit); -  } -  // Copy the result out of the physreg if we haven't already. -  if (!ResultReg) { -    ResultReg = createResultReg(TypeEntry.RC); -    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Copy), ResultReg) -        .addReg(OpEntry.DivRemResultReg); -  } -  updateValueMap(I, ResultReg); - -  return true; -} - -/// \brief Emit a conditional move instruction (if the are supported) to lower -/// the select. -bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) { -  // Check if the subtarget supports these instructions. -  if (!Subtarget->hasCMov()) -    return false; - -  // FIXME: Add support for i8. -  if (RetVT < MVT::i16 || RetVT > MVT::i64) -    return false; - -  const Value *Cond = I->getOperand(0); -  const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT); -  bool NeedTest = true; -  X86::CondCode CC = X86::COND_NE; - -  // Optimize conditions coming from a compare if both instructions are in the -  // same basic block (values defined in other basic blocks may not have -  // initialized registers). -  const auto *CI = dyn_cast<CmpInst>(Cond); -  if (CI && (CI->getParent() == I->getParent())) { -    CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); - -    // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction. 
-    static unsigned SETFOpcTable[2][3] = { -      { X86::SETNPr, X86::SETEr , X86::TEST8rr }, -      { X86::SETPr,  X86::SETNEr, X86::OR8rr   } -    }; -    unsigned *SETFOpc = nullptr; -    switch (Predicate) { -    default: break; -    case CmpInst::FCMP_OEQ: -      SETFOpc = &SETFOpcTable[0][0]; -      Predicate = CmpInst::ICMP_NE; -      break; -    case CmpInst::FCMP_UNE: -      SETFOpc = &SETFOpcTable[1][0]; -      Predicate = CmpInst::ICMP_NE; -      break; -    } - -    bool NeedSwap; -    std::tie(CC, NeedSwap) = getX86ConditionCode(Predicate); -    assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code."); - -    const Value *CmpLHS = CI->getOperand(0); -    const Value *CmpRHS = CI->getOperand(1); -    if (NeedSwap) -      std::swap(CmpLHS, CmpRHS); - -    EVT CmpVT = TLI.getValueType(CmpLHS->getType()); -    // Emit a compare of the LHS and RHS, setting the flags. -    if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT, CI->getDebugLoc())) -      return false; - -    if (SETFOpc) { -      unsigned FlagReg1 = createResultReg(&X86::GR8RegClass); -      unsigned FlagReg2 = createResultReg(&X86::GR8RegClass); -      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[0]), -              FlagReg1); -      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[1]), -              FlagReg2); -      auto const &II = TII.get(SETFOpc[2]); -      if (II.getNumDefs()) { -        unsigned TmpReg = createResultReg(&X86::GR8RegClass); -        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, TmpReg) -          .addReg(FlagReg2).addReg(FlagReg1); -      } else { -        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II) -          .addReg(FlagReg2).addReg(FlagReg1); -      } -    } -    NeedTest = false; -  } else if (foldX86XALUIntrinsic(CC, I, Cond)) { -    // Fake request the condition, otherwise the intrinsic might be completely -    // optimized away. -    unsigned TmpReg = getRegForValue(Cond); -    if (TmpReg == 0) -      return false; - -    NeedTest = false; -  } - -  if (NeedTest) { -    // Selects operate on i1, however, CondReg is 8 bits width and may contain -    // garbage. Indeed, only the less significant bit is supposed to be -    // accurate. If we read more than the lsb, we may see non-zero values -    // whereas lsb is zero. Therefore, we have to truncate Op0Reg to i1 for -    // the select. This is achieved by performing TEST against 1. -    unsigned CondReg = getRegForValue(Cond); -    if (CondReg == 0) -      return false; -    bool CondIsKill = hasTrivialKill(Cond); - -    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri)) -      .addReg(CondReg, getKillRegState(CondIsKill)).addImm(1); -  } - -  const Value *LHS = I->getOperand(1); -  const Value *RHS = I->getOperand(2); - -  unsigned RHSReg = getRegForValue(RHS); -  bool RHSIsKill = hasTrivialKill(RHS); - -  unsigned LHSReg = getRegForValue(LHS); -  bool LHSIsKill = hasTrivialKill(LHS); - -  if (!LHSReg || !RHSReg) -    return false; - -  unsigned Opc = X86::getCMovFromCond(CC, RC->getSize()); -  unsigned ResultReg = fastEmitInst_rr(Opc, RC, RHSReg, RHSIsKill, -                                       LHSReg, LHSIsKill); -  updateValueMap(I, ResultReg); -  return true; -} - -/// \brief Emit SSE instructions to lower the select. -/// -/// Try to use SSE1/SSE2 instructions to simulate a select without branches. -/// This lowers fp selects into a CMP/AND/ANDN/OR sequence when the necessary -/// SSE instructions are available. 
-bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) { -  // Optimize conditions coming from a compare if both instructions are in the -  // same basic block (values defined in other basic blocks may not have -  // initialized registers). -  const auto *CI = dyn_cast<FCmpInst>(I->getOperand(0)); -  if (!CI || (CI->getParent() != I->getParent())) -    return false; - -  if (I->getType() != CI->getOperand(0)->getType() || -      !((Subtarget->hasSSE1() && RetVT == MVT::f32) || -        (Subtarget->hasSSE2() && RetVT == MVT::f64))) -    return false; - -  const Value *CmpLHS = CI->getOperand(0); -  const Value *CmpRHS = CI->getOperand(1); -  CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); - -  // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, 0.0. -  // We don't have to materialize a zero constant for this case and can just use -  // %x again on the RHS. -  if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) { -    const auto *CmpRHSC = dyn_cast<ConstantFP>(CmpRHS); -    if (CmpRHSC && CmpRHSC->isNullValue()) -      CmpRHS = CmpLHS; -  } - -  unsigned CC; -  bool NeedSwap; -  std::tie(CC, NeedSwap) = getX86SSEConditionCode(Predicate); -  if (CC > 7) -    return false; - -  if (NeedSwap) -    std::swap(CmpLHS, CmpRHS); - -  static unsigned OpcTable[2][2][4] = { -    { { X86::CMPSSrr,  X86::FsANDPSrr,  X86::FsANDNPSrr,  X86::FsORPSrr  }, -      { X86::VCMPSSrr, X86::VFsANDPSrr, X86::VFsANDNPSrr, X86::VFsORPSrr }  }, -    { { X86::CMPSDrr,  X86::FsANDPDrr,  X86::FsANDNPDrr,  X86::FsORPDrr  }, -      { X86::VCMPSDrr, X86::VFsANDPDrr, X86::VFsANDNPDrr, X86::VFsORPDrr }  } -  }; - -  bool HasAVX = Subtarget->hasAVX(); -  unsigned *Opc = nullptr; -  switch (RetVT.SimpleTy) { -  default: return false; -  case MVT::f32: Opc = &OpcTable[0][HasAVX][0]; break; -  case MVT::f64: Opc = &OpcTable[1][HasAVX][0]; break; -  } - -  const Value *LHS = I->getOperand(1); -  const Value *RHS = I->getOperand(2); - -  unsigned LHSReg = getRegForValue(LHS); -  bool LHSIsKill = hasTrivialKill(LHS); - -  unsigned RHSReg = getRegForValue(RHS); -  bool RHSIsKill = hasTrivialKill(RHS); - -  unsigned CmpLHSReg = getRegForValue(CmpLHS); -  bool CmpLHSIsKill = hasTrivialKill(CmpLHS); - -  unsigned CmpRHSReg = getRegForValue(CmpRHS); -  bool CmpRHSIsKill = hasTrivialKill(CmpRHS); - -  if (!LHSReg || !RHSReg || !CmpLHS || !CmpRHS) -    return false; - -  const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT); -  unsigned CmpReg = fastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill, -                                     CmpRHSReg, CmpRHSIsKill, CC); -  unsigned AndReg = fastEmitInst_rr(Opc[1], RC, CmpReg, /*IsKill=*/false, -                                    LHSReg, LHSIsKill); -  unsigned AndNReg = fastEmitInst_rr(Opc[2], RC, CmpReg, /*IsKill=*/true, -                                     RHSReg, RHSIsKill); -  unsigned ResultReg = fastEmitInst_rr(Opc[3], RC, AndNReg, /*IsKill=*/true, -                                       AndReg, /*IsKill=*/true); -  updateValueMap(I, ResultReg); -  return true; -} - -bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) { -  // These are pseudo CMOV instructions and will be later expanded into control- -  // flow. 
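// Illustrative aside (editor's sketch, not part of the patch): the
// CMPSS/ANDPS/ANDNPS/ORPS sequence emitted above is a branchless scalar
// float select: the compare produces an all-ones or all-zeros mask that
// then picks the bits of one operand or the other. A plain C++ sketch of
// the same idea, assuming only <cstdint>/<cstring>; the function name is
// made up for illustration:
#include <cstdint>
#include <cstring>

static float selectByMask(bool CmpResult, float TrueVal, float FalseVal) {
  uint32_t Mask = CmpResult ? 0xFFFFFFFFu : 0u;   // CMPSS result
  uint32_t T, F;
  std::memcpy(&T, &TrueVal, sizeof(T));
  std::memcpy(&F, &FalseVal, sizeof(F));
  uint32_t Bits = (Mask & T) | (~Mask & F);       // ANDPS / ANDNPS / ORPS
  float Result;
  std::memcpy(&Result, &Bits, sizeof(Result));
  return Result;
}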
-  unsigned Opc; -  switch (RetVT.SimpleTy) { -  default: return false; -  case MVT::i8:  Opc = X86::CMOV_GR8;  break; -  case MVT::i16: Opc = X86::CMOV_GR16; break; -  case MVT::i32: Opc = X86::CMOV_GR32; break; -  case MVT::f32: Opc = X86::CMOV_FR32; break; -  case MVT::f64: Opc = X86::CMOV_FR64; break; -  } - -  const Value *Cond = I->getOperand(0); -  X86::CondCode CC = X86::COND_NE; - -  // Optimize conditions coming from a compare if both instructions are in the -  // same basic block (values defined in other basic blocks may not have -  // initialized registers). -  const auto *CI = dyn_cast<CmpInst>(Cond); -  if (CI && (CI->getParent() == I->getParent())) { -    bool NeedSwap; -    std::tie(CC, NeedSwap) = getX86ConditionCode(CI->getPredicate()); -    if (CC > X86::LAST_VALID_COND) -      return false; - -    const Value *CmpLHS = CI->getOperand(0); -    const Value *CmpRHS = CI->getOperand(1); - -    if (NeedSwap) -      std::swap(CmpLHS, CmpRHS); - -    EVT CmpVT = TLI.getValueType(CmpLHS->getType()); -    if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT, CI->getDebugLoc())) -      return false; -  } else { -    unsigned CondReg = getRegForValue(Cond); -    if (CondReg == 0) -      return false; -    bool CondIsKill = hasTrivialKill(Cond); -    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri)) -      .addReg(CondReg, getKillRegState(CondIsKill)).addImm(1); -  } - -  const Value *LHS = I->getOperand(1); -  const Value *RHS = I->getOperand(2); - -  unsigned LHSReg = getRegForValue(LHS); -  bool LHSIsKill = hasTrivialKill(LHS); - -  unsigned RHSReg = getRegForValue(RHS); -  bool RHSIsKill = hasTrivialKill(RHS); - -  if (!LHSReg || !RHSReg) -    return false; - -  const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT); - -  unsigned ResultReg = -    fastEmitInst_rri(Opc, RC, RHSReg, RHSIsKill, LHSReg, LHSIsKill, CC); -  updateValueMap(I, ResultReg); -  return true; -} - -bool X86FastISel::X86SelectSelect(const Instruction *I) { -  MVT RetVT; -  if (!isTypeLegal(I->getType(), RetVT)) -    return false; - -  // Check if we can fold the select. -  if (const auto *CI = dyn_cast<CmpInst>(I->getOperand(0))) { -    CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); -    const Value *Opnd = nullptr; -    switch (Predicate) { -    default:                              break; -    case CmpInst::FCMP_FALSE: Opnd = I->getOperand(2); break; -    case CmpInst::FCMP_TRUE:  Opnd = I->getOperand(1); break; -    } -    // No need for a select anymore - this is an unconditional move. -    if (Opnd) { -      unsigned OpReg = getRegForValue(Opnd); -      if (OpReg == 0) -        return false; -      bool OpIsKill = hasTrivialKill(Opnd); -      const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT); -      unsigned ResultReg = createResultReg(RC); -      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, -              TII.get(TargetOpcode::COPY), ResultReg) -        .addReg(OpReg, getKillRegState(OpIsKill)); -      updateValueMap(I, ResultReg); -      return true; -    } -  } - -  // First try to use real conditional move instructions. -  if (X86FastEmitCMoveSelect(RetVT, I)) -    return true; - -  // Try to use a sequence of SSE instructions to simulate a conditional move. -  if (X86FastEmitSSESelect(RetVT, I)) -    return true; - -  // Fall-back to pseudo conditional move instructions, which will be later -  // converted to control-flow. 
-  if (X86FastEmitPseudoSelect(RetVT, I)) -    return true; - -  return false; -} - -bool X86FastISel::X86SelectFPExt(const Instruction *I) { -  // fpext from float to double. -  if (X86ScalarSSEf64 && -      I->getType()->isDoubleTy()) { -    const Value *V = I->getOperand(0); -    if (V->getType()->isFloatTy()) { -      unsigned OpReg = getRegForValue(V); -      if (OpReg == 0) return false; -      unsigned ResultReg = createResultReg(&X86::FR64RegClass); -      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, -              TII.get(X86::CVTSS2SDrr), ResultReg) -        .addReg(OpReg); -      updateValueMap(I, ResultReg); -      return true; -    } -  } - -  return false; -} - -bool X86FastISel::X86SelectFPTrunc(const Instruction *I) { -  if (X86ScalarSSEf64) { -    if (I->getType()->isFloatTy()) { -      const Value *V = I->getOperand(0); -      if (V->getType()->isDoubleTy()) { -        unsigned OpReg = getRegForValue(V); -        if (OpReg == 0) return false; -        unsigned ResultReg = createResultReg(&X86::FR32RegClass); -        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, -                TII.get(X86::CVTSD2SSrr), ResultReg) -          .addReg(OpReg); -        updateValueMap(I, ResultReg); -        return true; -      } -    } -  } - -  return false; -} - -bool X86FastISel::X86SelectTrunc(const Instruction *I) { -  EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType()); -  EVT DstVT = TLI.getValueType(I->getType()); - -  // This code only handles truncation to byte. -  if (DstVT != MVT::i8 && DstVT != MVT::i1) -    return false; -  if (!TLI.isTypeLegal(SrcVT)) -    return false; - -  unsigned InputReg = getRegForValue(I->getOperand(0)); -  if (!InputReg) -    // Unhandled operand.  Halt "fast" selection and bail. -    return false; - -  if (SrcVT == MVT::i8) { -    // Truncate from i8 to i1; no code needed. -    updateValueMap(I, InputReg); -    return true; -  } - -  if (!Subtarget->is64Bit()) { -    // If we're on x86-32; we can't extract an i8 from a general register. -    // First issue a copy to GR16_ABCD or GR32_ABCD. -    const TargetRegisterClass *CopyRC = -      (SrcVT == MVT::i16) ? &X86::GR16_ABCDRegClass : &X86::GR32_ABCDRegClass; -    unsigned CopyReg = createResultReg(CopyRC); -    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, -            TII.get(TargetOpcode::COPY), CopyReg).addReg(InputReg); -    InputReg = CopyReg; -  } - -  // Issue an extract_subreg. -  unsigned ResultReg = fastEmitInst_extractsubreg(MVT::i8, -                                                  InputReg, /*Kill=*/true, -                                                  X86::sub_8bit); -  if (!ResultReg) -    return false; - -  updateValueMap(I, ResultReg); -  return true; -} - -bool X86FastISel::IsMemcpySmall(uint64_t Len) { -  return Len <= (Subtarget->is64Bit() ? 32 : 16); -} - -bool X86FastISel::TryEmitSmallMemcpy(X86AddressMode DestAM, -                                     X86AddressMode SrcAM, uint64_t Len) { - -  // Make sure we don't bloat code by inlining very large memcpy's. -  if (!IsMemcpySmall(Len)) -    return false; - -  bool i64Legal = Subtarget->is64Bit(); - -  // We don't care about alignment here since we just emit integer accesses. 
-  while (Len) { -    MVT VT; -    if (Len >= 8 && i64Legal) -      VT = MVT::i64; -    else if (Len >= 4) -      VT = MVT::i32; -    else if (Len >= 2) -      VT = MVT::i16; -    else -      VT = MVT::i8; - -    unsigned Reg; -    bool RV = X86FastEmitLoad(VT, SrcAM, nullptr, Reg); -    RV &= X86FastEmitStore(VT, Reg, /*Kill=*/true, DestAM); -    assert(RV && "Failed to emit load or store??"); - -    unsigned Size = VT.getSizeInBits()/8; -    Len -= Size; -    DestAM.Disp += Size; -    SrcAM.Disp += Size; -  } - -  return true; -} - -bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { -  // FIXME: Handle more intrinsics. -  switch (II->getIntrinsicID()) { -  default: return false; -  case Intrinsic::frameaddress: { -    Type *RetTy = II->getCalledFunction()->getReturnType(); - -    MVT VT; -    if (!isTypeLegal(RetTy, VT)) -      return false; - -    unsigned Opc; -    const TargetRegisterClass *RC = nullptr; - -    switch (VT.SimpleTy) { -    default: llvm_unreachable("Invalid result type for frameaddress."); -    case MVT::i32: Opc = X86::MOV32rm; RC = &X86::GR32RegClass; break; -    case MVT::i64: Opc = X86::MOV64rm; RC = &X86::GR64RegClass; break; -    } - -    // This needs to be set before we call getPtrSizedFrameRegister, otherwise -    // we get the wrong frame register. -    MachineFrameInfo *MFI = FuncInfo.MF->getFrameInfo(); -    MFI->setFrameAddressIsTaken(true); - -    const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( -        TM.getSubtargetImpl()->getRegisterInfo()); -    unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(*(FuncInfo.MF)); -    assert(((FrameReg == X86::RBP && VT == MVT::i64) || -            (FrameReg == X86::EBP && VT == MVT::i32)) && -           "Invalid Frame Register!"); - -    // Always make a copy of the frame register to to a vreg first, so that we -    // never directly reference the frame register (the TwoAddressInstruction- -    // Pass doesn't like that). -    unsigned SrcReg = createResultReg(RC); -    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, -            TII.get(TargetOpcode::COPY), SrcReg).addReg(FrameReg); - -    // Now recursively load from the frame address. -    // movq (%rbp), %rax -    // movq (%rax), %rax -    // movq (%rax), %rax -    // ... -    unsigned DestReg; -    unsigned Depth = cast<ConstantInt>(II->getOperand(0))->getZExtValue(); -    while (Depth--) { -      DestReg = createResultReg(RC); -      addDirectMem(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, -                           TII.get(Opc), DestReg), SrcReg); -      SrcReg = DestReg; -    } - -    updateValueMap(II, SrcReg); -    return true; -  } -  case Intrinsic::memcpy: { -    const MemCpyInst *MCI = cast<MemCpyInst>(II); -    // Don't handle volatile or variable length memcpys. -    if (MCI->isVolatile()) -      return false; - -    if (isa<ConstantInt>(MCI->getLength())) { -      // Small memcpy's are common enough that we want to do them -      // without a call if possible. -      uint64_t Len = cast<ConstantInt>(MCI->getLength())->getZExtValue(); -      if (IsMemcpySmall(Len)) { -        X86AddressMode DestAM, SrcAM; -        if (!X86SelectAddress(MCI->getRawDest(), DestAM) || -            !X86SelectAddress(MCI->getRawSource(), SrcAM)) -          return false; -        TryEmitSmallMemcpy(DestAM, SrcAM, Len); -        return true; -      } -    } - -    unsigned SizeWidth = Subtarget->is64Bit() ? 
64 : 32; -    if (!MCI->getLength()->getType()->isIntegerTy(SizeWidth)) -      return false; - -    if (MCI->getSourceAddressSpace() > 255 || MCI->getDestAddressSpace() > 255) -      return false; - -    return lowerCallTo(II, "memcpy", II->getNumArgOperands() - 2); -  } -  case Intrinsic::memset: { -    const MemSetInst *MSI = cast<MemSetInst>(II); - -    if (MSI->isVolatile()) -      return false; - -    unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32; -    if (!MSI->getLength()->getType()->isIntegerTy(SizeWidth)) -      return false; - -    if (MSI->getDestAddressSpace() > 255) -      return false; - -    return lowerCallTo(II, "memset", II->getNumArgOperands() - 2); -  } -  case Intrinsic::stackprotector: { -    // Emit code to store the stack guard onto the stack. -    EVT PtrTy = TLI.getPointerTy(); - -    const Value *Op1 = II->getArgOperand(0); // The guard's value. -    const AllocaInst *Slot = cast<AllocaInst>(II->getArgOperand(1)); - -    MFI.setStackProtectorIndex(FuncInfo.StaticAllocaMap[Slot]); - -    // Grab the frame index. -    X86AddressMode AM; -    if (!X86SelectAddress(Slot, AM)) return false; -    if (!X86FastEmitStore(PtrTy, Op1, AM)) return false; -    return true; -  } -  case Intrinsic::dbg_declare: { -    const DbgDeclareInst *DI = cast<DbgDeclareInst>(II); -    X86AddressMode AM; -    assert(DI->getAddress() && "Null address should be checked earlier!"); -    if (!X86SelectAddress(DI->getAddress(), AM)) -      return false; -    const MCInstrDesc &II = TII.get(TargetOpcode::DBG_VALUE); -    // FIXME may need to add RegState::Debug to any registers produced, -    // although ESP/EBP should be the only ones at the moment. -    addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II), AM) -        .addImm(0) -        .addMetadata(DI->getVariable()) -        .addMetadata(DI->getExpression()); -    return true; -  } -  case Intrinsic::trap: { -    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TRAP)); -    return true; -  } -  case Intrinsic::sqrt: { -    if (!Subtarget->hasSSE1()) -      return false; - -    Type *RetTy = II->getCalledFunction()->getReturnType(); - -    MVT VT; -    if (!isTypeLegal(RetTy, VT)) -      return false; - -    // Unfortunately we can't use fastEmit_r, because the AVX version of FSQRT -    // is not generated by FastISel yet. -    // FIXME: Update this code once tablegen can handle it. 
-    static const unsigned SqrtOpc[2][2] = { -      {X86::SQRTSSr, X86::VSQRTSSr}, -      {X86::SQRTSDr, X86::VSQRTSDr} -    }; -    bool HasAVX = Subtarget->hasAVX(); -    unsigned Opc; -    const TargetRegisterClass *RC; -    switch (VT.SimpleTy) { -    default: return false; -    case MVT::f32: Opc = SqrtOpc[0][HasAVX]; RC = &X86::FR32RegClass; break; -    case MVT::f64: Opc = SqrtOpc[1][HasAVX]; RC = &X86::FR64RegClass; break; -    } - -    const Value *SrcVal = II->getArgOperand(0); -    unsigned SrcReg = getRegForValue(SrcVal); - -    if (SrcReg == 0) -      return false; - -    unsigned ImplicitDefReg = 0; -    if (HasAVX) { -      ImplicitDefReg = createResultReg(RC); -      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, -              TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg); -    } - -    unsigned ResultReg = createResultReg(RC); -    MachineInstrBuilder MIB; -    MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), -                  ResultReg); - -    if (ImplicitDefReg) -      MIB.addReg(ImplicitDefReg); - -    MIB.addReg(SrcReg); - -    updateValueMap(II, ResultReg); -    return true; -  } -  case Intrinsic::sadd_with_overflow: -  case Intrinsic::uadd_with_overflow: -  case Intrinsic::ssub_with_overflow: -  case Intrinsic::usub_with_overflow: -  case Intrinsic::smul_with_overflow: -  case Intrinsic::umul_with_overflow: { -    // This implements the basic lowering of the xalu with overflow intrinsics -    // into add/sub/mul followed by either seto or setb. -    const Function *Callee = II->getCalledFunction(); -    auto *Ty = cast<StructType>(Callee->getReturnType()); -    Type *RetTy = Ty->getTypeAtIndex(0U); -    Type *CondTy = Ty->getTypeAtIndex(1); - -    MVT VT; -    if (!isTypeLegal(RetTy, VT)) -      return false; - -    if (VT < MVT::i8 || VT > MVT::i64) -      return false; - -    const Value *LHS = II->getArgOperand(0); -    const Value *RHS = II->getArgOperand(1); - -    // Canonicalize immediate to the RHS. -    if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS) && -        isCommutativeIntrinsic(II)) -      std::swap(LHS, RHS); - -    bool UseIncDec = false; -    if (isa<ConstantInt>(RHS) && cast<ConstantInt>(RHS)->isOne()) -      UseIncDec = true; - -    unsigned BaseOpc, CondOpc; -    switch (II->getIntrinsicID()) { -    default: llvm_unreachable("Unexpected intrinsic!"); -    case Intrinsic::sadd_with_overflow: -      BaseOpc = UseIncDec ? unsigned(X86ISD::INC) : unsigned(ISD::ADD); -      CondOpc = X86::SETOr; -      break; -    case Intrinsic::uadd_with_overflow: -      BaseOpc = ISD::ADD; CondOpc = X86::SETBr; break; -    case Intrinsic::ssub_with_overflow: -      BaseOpc = UseIncDec ? unsigned(X86ISD::DEC) : unsigned(ISD::SUB); -      CondOpc = X86::SETOr; -      break; -    case Intrinsic::usub_with_overflow: -      BaseOpc = ISD::SUB; CondOpc = X86::SETBr; break; -    case Intrinsic::smul_with_overflow: -      BaseOpc = X86ISD::SMUL; CondOpc = X86::SETOr; break; -    case Intrinsic::umul_with_overflow: -      BaseOpc = X86ISD::UMUL; CondOpc = X86::SETOr; break; -    } - -    unsigned LHSReg = getRegForValue(LHS); -    if (LHSReg == 0) -      return false; -    bool LHSIsKill = hasTrivialKill(LHS); - -    unsigned ResultReg = 0; -    // Check if we have an immediate version. 
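// Illustrative aside (editor's sketch, not part of the patch): the CondOpc
// choices above map the unsigned add/sub overflow intrinsics to SETB (the
// carry flag) and the signed forms to SETO (the overflow flag). A plain
// C++ model of both conditions for 32-bit addition; helper names are made
// up for illustration:
#include <cstdint>

static bool unsignedAddOverflows(uint32_t A, uint32_t B) {
  // CF after ADD: the true sum did not fit in 32 bits.
  return A + B < A;
}
static bool signedAddOverflows(int32_t A, int32_t B) {
  // OF after ADD: both inputs share a sign and the result's sign differs.
  uint32_t UA = static_cast<uint32_t>(A), UB = static_cast<uint32_t>(B);
  uint32_t Sum = UA + UB;
  return (((UA ^ Sum) & (UB ^ Sum)) >> 31) != 0;
}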
-    if (const auto *CI = dyn_cast<ConstantInt>(RHS)) { -      static const unsigned Opc[2][4] = { -        { X86::INC8r, X86::INC16r, X86::INC32r, X86::INC64r }, -        { X86::DEC8r, X86::DEC16r, X86::DEC32r, X86::DEC64r } -      }; - -      if (BaseOpc == X86ISD::INC || BaseOpc == X86ISD::DEC) { -        ResultReg = createResultReg(TLI.getRegClassFor(VT)); -        bool IsDec = BaseOpc == X86ISD::DEC; -        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, -                TII.get(Opc[IsDec][VT.SimpleTy-MVT::i8]), ResultReg) -          .addReg(LHSReg, getKillRegState(LHSIsKill)); -      } else -        ResultReg = fastEmit_ri(VT, VT, BaseOpc, LHSReg, LHSIsKill, -                                CI->getZExtValue()); -    } - -    unsigned RHSReg; -    bool RHSIsKill; -    if (!ResultReg) { -      RHSReg = getRegForValue(RHS); -      if (RHSReg == 0) -        return false; -      RHSIsKill = hasTrivialKill(RHS); -      ResultReg = fastEmit_rr(VT, VT, BaseOpc, LHSReg, LHSIsKill, RHSReg, -                              RHSIsKill); -    } - -    // FastISel doesn't have a pattern for all X86::MUL*r and X86::IMUL*r. Emit -    // it manually. -    if (BaseOpc == X86ISD::UMUL && !ResultReg) { -      static const unsigned MULOpc[] = -        { X86::MUL8r, X86::MUL16r, X86::MUL32r, X86::MUL64r }; -      static const unsigned Reg[] = { X86::AL, X86::AX, X86::EAX, X86::RAX }; -      // First copy the first operand into RAX, which is an implicit input to -      // the X86::MUL*r instruction. -      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, -              TII.get(TargetOpcode::COPY), Reg[VT.SimpleTy-MVT::i8]) -        .addReg(LHSReg, getKillRegState(LHSIsKill)); -      ResultReg = fastEmitInst_r(MULOpc[VT.SimpleTy-MVT::i8], -                                 TLI.getRegClassFor(VT), RHSReg, RHSIsKill); -    } else if (BaseOpc == X86ISD::SMUL && !ResultReg) { -      static const unsigned MULOpc[] = -        { X86::IMUL8r, X86::IMUL16rr, X86::IMUL32rr, X86::IMUL64rr }; -      if (VT == MVT::i8) { -        // Copy the first operand into AL, which is an implicit input to the -        // X86::IMUL8r instruction. 
-        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, -               TII.get(TargetOpcode::COPY), X86::AL) -          .addReg(LHSReg, getKillRegState(LHSIsKill)); -        ResultReg = fastEmitInst_r(MULOpc[0], TLI.getRegClassFor(VT), RHSReg, -                                   RHSIsKill); -      } else -        ResultReg = fastEmitInst_rr(MULOpc[VT.SimpleTy-MVT::i8], -                                    TLI.getRegClassFor(VT), LHSReg, LHSIsKill, -                                    RHSReg, RHSIsKill); -    } - -    if (!ResultReg) -      return false; - -    unsigned ResultReg2 = FuncInfo.CreateRegs(CondTy); -    assert((ResultReg+1) == ResultReg2 && "Nonconsecutive result registers."); -    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CondOpc), -            ResultReg2); - -    updateValueMap(II, ResultReg, 2); -    return true; -  } -  case Intrinsic::x86_sse_cvttss2si: -  case Intrinsic::x86_sse_cvttss2si64: -  case Intrinsic::x86_sse2_cvttsd2si: -  case Intrinsic::x86_sse2_cvttsd2si64: { -    bool IsInputDouble; -    switch (II->getIntrinsicID()) { -    default: llvm_unreachable("Unexpected intrinsic."); -    case Intrinsic::x86_sse_cvttss2si: -    case Intrinsic::x86_sse_cvttss2si64: -      if (!Subtarget->hasSSE1()) -        return false; -      IsInputDouble = false; -      break; -    case Intrinsic::x86_sse2_cvttsd2si: -    case Intrinsic::x86_sse2_cvttsd2si64: -      if (!Subtarget->hasSSE2()) -        return false; -      IsInputDouble = true; -      break; -    } - -    Type *RetTy = II->getCalledFunction()->getReturnType(); -    MVT VT; -    if (!isTypeLegal(RetTy, VT)) -      return false; - -    static const unsigned CvtOpc[2][2][2] = { -      { { X86::CVTTSS2SIrr,   X86::VCVTTSS2SIrr   }, -        { X86::CVTTSS2SI64rr, X86::VCVTTSS2SI64rr }  }, -      { { X86::CVTTSD2SIrr,   X86::VCVTTSD2SIrr   }, -        { X86::CVTTSD2SI64rr, X86::VCVTTSD2SI64rr }  } -    }; -    bool HasAVX = Subtarget->hasAVX(); -    unsigned Opc; -    switch (VT.SimpleTy) { -    default: llvm_unreachable("Unexpected result type."); -    case MVT::i32: Opc = CvtOpc[IsInputDouble][0][HasAVX]; break; -    case MVT::i64: Opc = CvtOpc[IsInputDouble][1][HasAVX]; break; -    } - -    // Check if we can fold insertelement instructions into the convert. -    const Value *Op = II->getArgOperand(0); -    while (auto *IE = dyn_cast<InsertElementInst>(Op)) { -      const Value *Index = IE->getOperand(2); -      if (!isa<ConstantInt>(Index)) -        break; -      unsigned Idx = cast<ConstantInt>(Index)->getZExtValue(); - -      if (Idx == 0) { -        Op = IE->getOperand(1); -        break; -      } -      Op = IE->getOperand(0); -    } - -    unsigned Reg = getRegForValue(Op); -    if (Reg == 0) -      return false; - -    unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); -    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) -      .addReg(Reg); - -    updateValueMap(II, ResultReg); -    return true; -  } -  } -} - -bool X86FastISel::fastLowerArguments() { -  if (!FuncInfo.CanLowerReturn) -    return false; - -  const Function *F = FuncInfo.Fn; -  if (F->isVarArg()) -    return false; - -  CallingConv::ID CC = F->getCallingConv(); -  if (CC != CallingConv::C) -    return false; - -  if (Subtarget->isCallingConvWin64(CC)) -    return false; - -  if (!Subtarget->is64Bit()) -    return false; - -  // Only handle simple cases. i.e. Up to 6 i32/i64 scalar arguments. 
-  unsigned GPRCnt = 0; -  unsigned FPRCnt = 0; -  unsigned Idx = 0; -  for (auto const &Arg : F->args()) { -    // The first argument is at index 1. -    ++Idx; -    if (F->getAttributes().hasAttribute(Idx, Attribute::ByVal) || -        F->getAttributes().hasAttribute(Idx, Attribute::InReg) || -        F->getAttributes().hasAttribute(Idx, Attribute::StructRet) || -        F->getAttributes().hasAttribute(Idx, Attribute::Nest)) -      return false; - -    Type *ArgTy = Arg.getType(); -    if (ArgTy->isStructTy() || ArgTy->isArrayTy() || ArgTy->isVectorTy()) -      return false; - -    EVT ArgVT = TLI.getValueType(ArgTy); -    if (!ArgVT.isSimple()) return false; -    switch (ArgVT.getSimpleVT().SimpleTy) { -    default: return false; -    case MVT::i32: -    case MVT::i64: -      ++GPRCnt; -      break; -    case MVT::f32: -    case MVT::f64: -      if (!Subtarget->hasSSE1()) -        return false; -      ++FPRCnt; -      break; -    } - -    if (GPRCnt > 6) -      return false; - -    if (FPRCnt > 8) -      return false; -  } - -  static const MCPhysReg GPR32ArgRegs[] = { -    X86::EDI, X86::ESI, X86::EDX, X86::ECX, X86::R8D, X86::R9D -  }; -  static const MCPhysReg GPR64ArgRegs[] = { -    X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8 , X86::R9 -  }; -  static const MCPhysReg XMMArgRegs[] = { -    X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, -    X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 -  }; - -  unsigned GPRIdx = 0; -  unsigned FPRIdx = 0; -  for (auto const &Arg : F->args()) { -    MVT VT = TLI.getSimpleValueType(Arg.getType()); -    const TargetRegisterClass *RC = TLI.getRegClassFor(VT); -    unsigned SrcReg; -    switch (VT.SimpleTy) { -    default: llvm_unreachable("Unexpected value type."); -    case MVT::i32: SrcReg = GPR32ArgRegs[GPRIdx++]; break; -    case MVT::i64: SrcReg = GPR64ArgRegs[GPRIdx++]; break; -    case MVT::f32: // fall-through -    case MVT::f64: SrcReg = XMMArgRegs[FPRIdx++]; break; -    } -    unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC); -    // FIXME: Unfortunately it's necessary to emit a copy from the livein copy. -    // Without this, EmitLiveInCopies may eliminate the livein if its only -    // use is a bitcast (which isn't turned into an instruction). 
-    unsigned ResultReg = createResultReg(RC); -    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, -            TII.get(TargetOpcode::COPY), ResultReg) -      .addReg(DstReg, getKillRegState(true)); -    updateValueMap(&Arg, ResultReg); -  } -  return true; -} - -static unsigned computeBytesPoppedByCallee(const X86Subtarget *Subtarget, -                                           CallingConv::ID CC, -                                           ImmutableCallSite *CS) { -  if (Subtarget->is64Bit()) -    return 0; -  if (Subtarget->getTargetTriple().isOSMSVCRT()) -    return 0; -  if (CC == CallingConv::Fast || CC == CallingConv::GHC || -      CC == CallingConv::HiPE) -    return 0; -  if (CS && !CS->paramHasAttr(1, Attribute::StructRet)) -    return 0; -  if (CS && CS->paramHasAttr(1, Attribute::InReg)) -    return 0; -  return 4; -} - -bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { -  auto &OutVals       = CLI.OutVals; -  auto &OutFlags      = CLI.OutFlags; -  auto &OutRegs       = CLI.OutRegs; -  auto &Ins           = CLI.Ins; -  auto &InRegs        = CLI.InRegs; -  CallingConv::ID CC  = CLI.CallConv; -  bool &IsTailCall    = CLI.IsTailCall; -  bool IsVarArg       = CLI.IsVarArg; -  const Value *Callee = CLI.Callee; -  const char *SymName = CLI.SymName; - -  bool Is64Bit        = Subtarget->is64Bit(); -  bool IsWin64        = Subtarget->isCallingConvWin64(CC); - -  // Handle only C, fastcc, and webkit_js calling conventions for now. -  switch (CC) { -  default: return false; -  case CallingConv::C: -  case CallingConv::Fast: -  case CallingConv::WebKit_JS: -  case CallingConv::X86_FastCall: -  case CallingConv::X86_64_Win64: -  case CallingConv::X86_64_SysV: -    break; -  } - -  // Allow SelectionDAG isel to handle tail calls. -  if (IsTailCall) -    return false; - -  // fastcc with -tailcallopt is intended to provide a guaranteed -  // tail call optimization. Fastisel doesn't know how to do that. -  if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) -    return false; - -  // Don't know how to handle Win64 varargs yet.  Nothing special needed for -  // x86-32. Special handling for x86-64 is implemented. -  if (IsVarArg && IsWin64) -    return false; - -  // Don't know about inalloca yet. -  if (CLI.CS && CLI.CS->hasInAllocaArgument()) -    return false; - -  // Fast-isel doesn't know about callee-pop yet. -  if (X86::isCalleePop(CC, Subtarget->is64Bit(), IsVarArg, -                       TM.Options.GuaranteedTailCallOpt)) -    return false; - -  SmallVector<MVT, 16> OutVTs; -  SmallVector<unsigned, 16> ArgRegs; - -  // If this is a constant i1/i8/i16 argument, promote to i32 to avoid an extra -  // instruction. This is safe because it is common to all FastISel supported -  // calling conventions on x86. -  for (int i = 0, e = OutVals.size(); i != e; ++i) { -    Value *&Val = OutVals[i]; -    ISD::ArgFlagsTy Flags = OutFlags[i]; -    if (auto *CI = dyn_cast<ConstantInt>(Val)) { -      if (CI->getBitWidth() < 32) { -        if (Flags.isSExt()) -          Val = ConstantExpr::getSExt(CI, Type::getInt32Ty(CI->getContext())); -        else -          Val = ConstantExpr::getZExt(CI, Type::getInt32Ty(CI->getContext())); -      } -    } - -    // Passing bools around ends up doing a trunc to i1 and passing it. -    // Codegen this as an argument + "and 1". 
-    MVT VT; -    auto *TI = dyn_cast<TruncInst>(Val); -    unsigned ResultReg; -    if (TI && TI->getType()->isIntegerTy(1) && CLI.CS && -              (TI->getParent() == CLI.CS->getInstruction()->getParent()) && -              TI->hasOneUse()) { -      Value *PrevVal = TI->getOperand(0); -      ResultReg = getRegForValue(PrevVal); - -      if (!ResultReg) -        return false; - -      if (!isTypeLegal(PrevVal->getType(), VT)) -        return false; - -      ResultReg = -        fastEmit_ri(VT, VT, ISD::AND, ResultReg, hasTrivialKill(PrevVal), 1); -    } else { -      if (!isTypeLegal(Val->getType(), VT)) -        return false; -      ResultReg = getRegForValue(Val); -    } - -    if (!ResultReg) -      return false; - -    ArgRegs.push_back(ResultReg); -    OutVTs.push_back(VT); -  } - -  // Analyze operands of the call, assigning locations to each operand. -  SmallVector<CCValAssign, 16> ArgLocs; -  CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, ArgLocs, CLI.RetTy->getContext()); - -  // Allocate shadow area for Win64 -  if (IsWin64) -    CCInfo.AllocateStack(32, 8); - -  CCInfo.AnalyzeCallOperands(OutVTs, OutFlags, CC_X86); - -  // Get a count of how many bytes are to be pushed on the stack. -  unsigned NumBytes = CCInfo.getNextStackOffset(); - -  // Issue CALLSEQ_START -  unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); -  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown)) -    .addImm(NumBytes); - -  // Walk the register/memloc assignments, inserting copies/loads. -  const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( -      TM.getSubtargetImpl()->getRegisterInfo()); -  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { -    CCValAssign const &VA = ArgLocs[i]; -    const Value *ArgVal = OutVals[VA.getValNo()]; -    MVT ArgVT = OutVTs[VA.getValNo()]; - -    if (ArgVT == MVT::x86mmx) -      return false; - -    unsigned ArgReg = ArgRegs[VA.getValNo()]; - -    // Promote the value if needed. 
-    switch (VA.getLocInfo()) { -    case CCValAssign::Full: break; -    case CCValAssign::SExt: { -      assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() && -             "Unexpected extend"); -      bool Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), ArgReg, -                                       ArgVT, ArgReg); -      assert(Emitted && "Failed to emit a sext!"); (void)Emitted; -      ArgVT = VA.getLocVT(); -      break; -    } -    case CCValAssign::ZExt: { -      assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() && -             "Unexpected extend"); -      bool Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), ArgReg, -                                       ArgVT, ArgReg); -      assert(Emitted && "Failed to emit a zext!"); (void)Emitted; -      ArgVT = VA.getLocVT(); -      break; -    } -    case CCValAssign::AExt: { -      assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() && -             "Unexpected extend"); -      bool Emitted = X86FastEmitExtend(ISD::ANY_EXTEND, VA.getLocVT(), ArgReg, -                                       ArgVT, ArgReg); -      if (!Emitted) -        Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), ArgReg, -                                    ArgVT, ArgReg); -      if (!Emitted) -        Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), ArgReg, -                                    ArgVT, ArgReg); - -      assert(Emitted && "Failed to emit a aext!"); (void)Emitted; -      ArgVT = VA.getLocVT(); -      break; -    } -    case CCValAssign::BCvt: { -      ArgReg = fastEmit_r(ArgVT, VA.getLocVT(), ISD::BITCAST, ArgReg, -                          /*TODO: Kill=*/false); -      assert(ArgReg && "Failed to emit a bitcast!"); -      ArgVT = VA.getLocVT(); -      break; -    } -    case CCValAssign::VExt: -      // VExt has not been implemented, so this should be impossible to reach -      // for now.  However, fallback to Selection DAG isel once implemented. -      return false; -    case CCValAssign::AExtUpper: -    case CCValAssign::SExtUpper: -    case CCValAssign::ZExtUpper: -    case CCValAssign::FPExt: -      llvm_unreachable("Unexpected loc info!"); -    case CCValAssign::Indirect: -      // FIXME: Indirect doesn't need extending, but fast-isel doesn't fully -      // support this. -      return false; -    } - -    if (VA.isRegLoc()) { -      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, -              TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(ArgReg); -      OutRegs.push_back(VA.getLocReg()); -    } else { -      assert(VA.isMemLoc()); - -      // Don't emit stores for undef values. 
-      if (isa<UndefValue>(ArgVal)) -        continue; - -      unsigned LocMemOffset = VA.getLocMemOffset(); -      X86AddressMode AM; -      AM.Base.Reg = RegInfo->getStackRegister(); -      AM.Disp = LocMemOffset; -      ISD::ArgFlagsTy Flags = OutFlags[VA.getValNo()]; -      unsigned Alignment = DL.getABITypeAlignment(ArgVal->getType()); -      MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( -        MachinePointerInfo::getStack(LocMemOffset), MachineMemOperand::MOStore, -        ArgVT.getStoreSize(), Alignment); -      if (Flags.isByVal()) { -        X86AddressMode SrcAM; -        SrcAM.Base.Reg = ArgReg; -        if (!TryEmitSmallMemcpy(AM, SrcAM, Flags.getByValSize())) -          return false; -      } else if (isa<ConstantInt>(ArgVal) || isa<ConstantPointerNull>(ArgVal)) { -        // If this is a really simple value, emit this with the Value* version -        // of X86FastEmitStore.  If it isn't simple, we don't want to do this, -        // as it can cause us to reevaluate the argument. -        if (!X86FastEmitStore(ArgVT, ArgVal, AM, MMO)) -          return false; -      } else { -        bool ValIsKill = hasTrivialKill(ArgVal); -        if (!X86FastEmitStore(ArgVT, ArgReg, ValIsKill, AM, MMO)) -          return false; -      } -    } -  } - -  // ELF / PIC requires GOT in the EBX register before function calls via PLT -  // GOT pointer. -  if (Subtarget->isPICStyleGOT()) { -    unsigned Base = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF); -    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, -            TII.get(TargetOpcode::COPY), X86::EBX).addReg(Base); -  } - -  if (Is64Bit && IsVarArg && !IsWin64) { -    // From AMD64 ABI document: -    // For calls that may call functions that use varargs or stdargs -    // (prototype-less calls or calls to functions containing ellipsis (...) in -    // the declaration) %al is used as hidden argument to specify the number -    // of SSE registers used. The contents of %al do not need to match exactly -    // the number of registers, but must be an ubound on the number of SSE -    // registers used and is in the range 0 - 8 inclusive. - -    // Count the number of XMM registers allocated. -    static const MCPhysReg XMMArgRegs[] = { -      X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, -      X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 -    }; -    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8); -    assert((Subtarget->hasSSE1() || !NumXMMRegs) -           && "SSE registers cannot be used when SSE is disabled"); -    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV8ri), -            X86::AL).addImm(NumXMMRegs); -  } - -  // Materialize callee address in a register. FIXME: GV address can be -  // handled with a CALLpcrel32 instead. -  X86AddressMode CalleeAM; -  if (!X86SelectCallAddress(Callee, CalleeAM)) -    return false; - -  unsigned CalleeOp = 0; -  const GlobalValue *GV = nullptr; -  if (CalleeAM.GV != nullptr) { -    GV = CalleeAM.GV; -  } else if (CalleeAM.Base.Reg != 0) { -    CalleeOp = CalleeAM.Base.Reg; -  } else -    return false; - -  // Issue the call. -  MachineInstrBuilder MIB; -  if (CalleeOp) { -    // Register-indirect call. -    unsigned CallOpc = Is64Bit ? X86::CALL64r : X86::CALL32r; -    MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CallOpc)) -      .addReg(CalleeOp); -  } else { -    // Direct call. -    assert(GV && "Not a direct call"); -    unsigned CallOpc = Is64Bit ? 
X86::CALL64pcrel32 : X86::CALLpcrel32; - -    // See if we need any target-specific flags on the GV operand. -    unsigned char OpFlags = 0; - -    // On ELF targets, in both X86-64 and X86-32 mode, direct calls to -    // external symbols most go through the PLT in PIC mode.  If the symbol -    // has hidden or protected visibility, or if it is static or local, then -    // we don't need to use the PLT - we can directly call it. -    if (Subtarget->isTargetELF() && -        TM.getRelocationModel() == Reloc::PIC_ && -        GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) { -      OpFlags = X86II::MO_PLT; -    } else if (Subtarget->isPICStyleStubAny() && -               (GV->isDeclaration() || GV->isWeakForLinker()) && -               (!Subtarget->getTargetTriple().isMacOSX() || -                Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { -      // PC-relative references to external symbols should go through $stub, -      // unless we're building with the leopard linker or later, which -      // automatically synthesizes these stubs. -      OpFlags = X86II::MO_DARWIN_STUB; -    } - -    MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CallOpc)); -    if (SymName) -      MIB.addExternalSymbol(SymName, OpFlags); -    else -      MIB.addGlobalAddress(GV, 0, OpFlags); -  } - -  // Add a register mask operand representing the call-preserved registers. -  // Proper defs for return values will be added by setPhysRegsDeadExcept(). -  MIB.addRegMask(TRI.getCallPreservedMask(CC)); - -  // Add an implicit use GOT pointer in EBX. -  if (Subtarget->isPICStyleGOT()) -    MIB.addReg(X86::EBX, RegState::Implicit); - -  if (Is64Bit && IsVarArg && !IsWin64) -    MIB.addReg(X86::AL, RegState::Implicit); - -  // Add implicit physical register uses to the call. -  for (auto Reg : OutRegs) -    MIB.addReg(Reg, RegState::Implicit); - -  // Issue CALLSEQ_END -  unsigned NumBytesForCalleeToPop = -    computeBytesPoppedByCallee(Subtarget, CC, CLI.CS); -  unsigned AdjStackUp = TII.getCallFrameDestroyOpcode(); -  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackUp)) -    .addImm(NumBytes).addImm(NumBytesForCalleeToPop); - -  // Now handle call return values. -  SmallVector<CCValAssign, 16> RVLocs; -  CCState CCRetInfo(CC, IsVarArg, *FuncInfo.MF, RVLocs, -                    CLI.RetTy->getContext()); -  CCRetInfo.AnalyzeCallResult(Ins, RetCC_X86); - -  // Copy all of the result registers out of their specified physreg. -  unsigned ResultReg = FuncInfo.CreateRegs(CLI.RetTy); -  for (unsigned i = 0; i != RVLocs.size(); ++i) { -    CCValAssign &VA = RVLocs[i]; -    EVT CopyVT = VA.getValVT(); -    unsigned CopyReg = ResultReg + i; - -    // If this is x86-64, and we disabled SSE, we can't return FP values -    if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) && -        ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) { -      report_fatal_error("SSE register return with SSE disabled"); -    } - -    // If we prefer to use the value in xmm registers, copy it out as f80 and -    // use a truncate to move it from fp stack reg to xmm reg. -    if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) && -        isScalarFPTypeInSSEReg(VA.getValVT())) { -      CopyVT = MVT::f80; -      CopyReg = createResultReg(&X86::RFP80RegClass); -    } - -    // Copy out the result. 
-    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, -            TII.get(TargetOpcode::COPY), CopyReg).addReg(VA.getLocReg()); -    InRegs.push_back(VA.getLocReg()); - -    // Round the f80 to the right size, which also moves it to the appropriate -    // xmm register. This is accomplished by storing the f80 value in memory -    // and then loading it back. -    if (CopyVT != VA.getValVT()) { -      EVT ResVT = VA.getValVT(); -      unsigned Opc = ResVT == MVT::f32 ? X86::ST_Fp80m32 : X86::ST_Fp80m64; -      unsigned MemSize = ResVT.getSizeInBits()/8; -      int FI = MFI.CreateStackObject(MemSize, MemSize, false); -      addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, -                                TII.get(Opc)), FI) -        .addReg(CopyReg); -      Opc = ResVT == MVT::f32 ? X86::MOVSSrm : X86::MOVSDrm; -      addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, -                                TII.get(Opc), ResultReg + i), FI); -    } -  } - -  CLI.ResultReg = ResultReg; -  CLI.NumResultRegs = RVLocs.size(); -  CLI.Call = MIB; - -  return true; -} - -bool -X86FastISel::fastSelectInstruction(const Instruction *I)  { -  switch (I->getOpcode()) { -  default: break; -  case Instruction::Load: -    return X86SelectLoad(I); -  case Instruction::Store: -    return X86SelectStore(I); -  case Instruction::Ret: -    return X86SelectRet(I); -  case Instruction::ICmp: -  case Instruction::FCmp: -    return X86SelectCmp(I); -  case Instruction::ZExt: -    return X86SelectZExt(I); -  case Instruction::Br: -    return X86SelectBranch(I); -  case Instruction::LShr: -  case Instruction::AShr: -  case Instruction::Shl: -    return X86SelectShift(I); -  case Instruction::SDiv: -  case Instruction::UDiv: -  case Instruction::SRem: -  case Instruction::URem: -    return X86SelectDivRem(I); -  case Instruction::Select: -    return X86SelectSelect(I); -  case Instruction::Trunc: -    return X86SelectTrunc(I); -  case Instruction::FPExt: -    return X86SelectFPExt(I); -  case Instruction::FPTrunc: -    return X86SelectFPTrunc(I); -  case Instruction::IntToPtr: // Deliberate fall-through. 
-  case Instruction::PtrToInt: { -    EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType()); -    EVT DstVT = TLI.getValueType(I->getType()); -    if (DstVT.bitsGT(SrcVT)) -      return X86SelectZExt(I); -    if (DstVT.bitsLT(SrcVT)) -      return X86SelectTrunc(I); -    unsigned Reg = getRegForValue(I->getOperand(0)); -    if (Reg == 0) return false; -    updateValueMap(I, Reg); -    return true; -  } -  } - -  return false; -} - -unsigned X86FastISel::X86MaterializeInt(const ConstantInt *CI, MVT VT) { -  if (VT > MVT::i64) -    return 0; - -  uint64_t Imm = CI->getZExtValue(); -  if (Imm == 0) { -    unsigned SrcReg = fastEmitInst_(X86::MOV32r0, &X86::GR32RegClass); -    switch (VT.SimpleTy) { -    default: llvm_unreachable("Unexpected value type"); -    case MVT::i1: -    case MVT::i8: -      return fastEmitInst_extractsubreg(MVT::i8, SrcReg, /*Kill=*/true, -                                        X86::sub_8bit); -    case MVT::i16: -      return fastEmitInst_extractsubreg(MVT::i16, SrcReg, /*Kill=*/true, -                                        X86::sub_16bit); -    case MVT::i32: -      return SrcReg; -    case MVT::i64: { -      unsigned ResultReg = createResultReg(&X86::GR64RegClass); -      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, -              TII.get(TargetOpcode::SUBREG_TO_REG), ResultReg) -        .addImm(0).addReg(SrcReg).addImm(X86::sub_32bit); -      return ResultReg; -    } -    } -  } - -  unsigned Opc = 0; -  switch (VT.SimpleTy) { -  default: llvm_unreachable("Unexpected value type"); -  case MVT::i1:  VT = MVT::i8; // fall-through -  case MVT::i8:  Opc = X86::MOV8ri;  break; -  case MVT::i16: Opc = X86::MOV16ri; break; -  case MVT::i32: Opc = X86::MOV32ri; break; -  case MVT::i64: { -    if (isUInt<32>(Imm)) -      Opc = X86::MOV32ri; -    else if (isInt<32>(Imm)) -      Opc = X86::MOV64ri32; -    else -      Opc = X86::MOV64ri; -    break; -  } -  } -  if (VT == MVT::i64 && Opc == X86::MOV32ri) { -    unsigned SrcReg = fastEmitInst_i(Opc, &X86::GR32RegClass, Imm); -    unsigned ResultReg = createResultReg(&X86::GR64RegClass); -    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, -            TII.get(TargetOpcode::SUBREG_TO_REG), ResultReg) -      .addImm(0).addReg(SrcReg).addImm(X86::sub_32bit); -    return ResultReg; -  } -  return fastEmitInst_i(Opc, TLI.getRegClassFor(VT), Imm); -} - -unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) { -  if (CFP->isNullValue()) -    return fastMaterializeFloatZero(CFP); - -  // Can't handle alternate code models yet. -  CodeModel::Model CM = TM.getCodeModel(); -  if (CM != CodeModel::Small && CM != CodeModel::Large) -    return 0; - -  // Get opcode and regclass of the output for the given load instruction. -  unsigned Opc = 0; -  const TargetRegisterClass *RC = nullptr; -  switch (VT.SimpleTy) { -  default: return 0; -  case MVT::f32: -    if (X86ScalarSSEf32) { -      Opc = Subtarget->hasAVX() ? X86::VMOVSSrm : X86::MOVSSrm; -      RC  = &X86::FR32RegClass; -    } else { -      Opc = X86::LD_Fp32m; -      RC  = &X86::RFP32RegClass; -    } -    break; -  case MVT::f64: -    if (X86ScalarSSEf64) { -      Opc = Subtarget->hasAVX() ? X86::VMOVSDrm : X86::MOVSDrm; -      RC  = &X86::FR64RegClass; -    } else { -      Opc = X86::LD_Fp64m; -      RC  = &X86::RFP64RegClass; -    } -    break; -  case MVT::f80: -    // No f80 support yet. -    return 0; -  } - -  // MachineConstantPool wants an explicit alignment. 
-  unsigned Align = DL.getPrefTypeAlignment(CFP->getType()); -  if (Align == 0) { -    // Alignment of vector types. FIXME! -    Align = DL.getTypeAllocSize(CFP->getType()); -  } - -  // x86-32 PIC requires a PIC base register for constant pools. -  unsigned PICBase = 0; -  unsigned char OpFlag = 0; -  if (Subtarget->isPICStyleStubPIC()) { // Not dynamic-no-pic -    OpFlag = X86II::MO_PIC_BASE_OFFSET; -    PICBase = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF); -  } else if (Subtarget->isPICStyleGOT()) { -    OpFlag = X86II::MO_GOTOFF; -    PICBase = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF); -  } else if (Subtarget->isPICStyleRIPRel() && -             TM.getCodeModel() == CodeModel::Small) { -    PICBase = X86::RIP; -  } - -  // Create the load from the constant pool. -  unsigned CPI = MCP.getConstantPoolIndex(CFP, Align); -  unsigned ResultReg = createResultReg(RC); - -  if (CM == CodeModel::Large) { -    unsigned AddrReg = createResultReg(&X86::GR64RegClass); -    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV64ri), -            AddrReg) -      .addConstantPoolIndex(CPI, 0, OpFlag); -    MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, -                                      TII.get(Opc), ResultReg); -    addDirectMem(MIB, AddrReg); -    MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( -        MachinePointerInfo::getConstantPool(), MachineMemOperand::MOLoad, -        TM.getDataLayout()->getPointerSize(), Align); -    MIB->addMemOperand(*FuncInfo.MF, MMO); -    return ResultReg; -  } - -  addConstantPoolReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, -                                   TII.get(Opc), ResultReg), -                           CPI, PICBase, OpFlag); -  return ResultReg; -} - -unsigned X86FastISel::X86MaterializeGV(const GlobalValue *GV, MVT VT) { -  // Can't handle alternate code models yet. -  if (TM.getCodeModel() != CodeModel::Small) -    return 0; - -  // Materialize addresses with LEA/MOV instructions. -  X86AddressMode AM; -  if (X86SelectAddress(GV, AM)) { -    // If the expression is just a basereg, then we're done, otherwise we need -    // to emit an LEA. -    if (AM.BaseType == X86AddressMode::RegBase && -        AM.IndexReg == 0 && AM.Disp == 0 && AM.GV == nullptr) -      return AM.Base.Reg; - -    unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); -    if (TM.getRelocationModel() == Reloc::Static && -        TLI.getPointerTy() == MVT::i64) { -      // The displacement code could be more than 32 bits away so we need to use -      // an instruction with a 64 bit immediate -      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV64ri), -              ResultReg) -        .addGlobalAddress(GV); -    } else { -      unsigned Opc = TLI.getPointerTy() == MVT::i32 -                     ? (Subtarget->isTarget64BitILP32() -                        ? X86::LEA64_32r : X86::LEA32r) -                     : X86::LEA64r; -      addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, -                             TII.get(Opc), ResultReg), AM); -    } -    return ResultReg; -  } -  return 0; -} - -unsigned X86FastISel::fastMaterializeConstant(const Constant *C) { -  EVT CEVT = TLI.getValueType(C->getType(), true); - -  // Only handle simple types. 
-  if (!CEVT.isSimple()) -    return 0; -  MVT VT = CEVT.getSimpleVT(); - -  if (const auto *CI = dyn_cast<ConstantInt>(C)) -    return X86MaterializeInt(CI, VT); -  else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C)) -    return X86MaterializeFP(CFP, VT); -  else if (const GlobalValue *GV = dyn_cast<GlobalValue>(C)) -    return X86MaterializeGV(GV, VT); - -  return 0; -} - -unsigned X86FastISel::fastMaterializeAlloca(const AllocaInst *C) { -  // Fail on dynamic allocas. At this point, getRegForValue has already -  // checked its CSE maps, so if we're here trying to handle a dynamic -  // alloca, we're not going to succeed. X86SelectAddress has a -  // check for dynamic allocas, because it's called directly from -  // various places, but targetMaterializeAlloca also needs a check -  // in order to avoid recursion between getRegForValue, -  // X86SelectAddrss, and targetMaterializeAlloca. -  if (!FuncInfo.StaticAllocaMap.count(C)) -    return 0; -  assert(C->isStaticAlloca() && "dynamic alloca in the static alloca map?"); - -  X86AddressMode AM; -  if (!X86SelectAddress(C, AM)) -    return 0; -  unsigned Opc = TLI.getPointerTy() == MVT::i32 -                 ? (Subtarget->isTarget64BitILP32() -                    ? X86::LEA64_32r : X86::LEA32r) -                 : X86::LEA64r; -  const TargetRegisterClass* RC = TLI.getRegClassFor(TLI.getPointerTy()); -  unsigned ResultReg = createResultReg(RC); -  addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, -                         TII.get(Opc), ResultReg), AM); -  return ResultReg; -} - -unsigned X86FastISel::fastMaterializeFloatZero(const ConstantFP *CF) { -  MVT VT; -  if (!isTypeLegal(CF->getType(), VT)) -    return 0; - -  // Get opcode and regclass for the given zero. -  unsigned Opc = 0; -  const TargetRegisterClass *RC = nullptr; -  switch (VT.SimpleTy) { -  default: return 0; -  case MVT::f32: -    if (X86ScalarSSEf32) { -      Opc = X86::FsFLD0SS; -      RC  = &X86::FR32RegClass; -    } else { -      Opc = X86::LD_Fp032; -      RC  = &X86::RFP32RegClass; -    } -    break; -  case MVT::f64: -    if (X86ScalarSSEf64) { -      Opc = X86::FsFLD0SD; -      RC  = &X86::FR64RegClass; -    } else { -      Opc = X86::LD_Fp064; -      RC  = &X86::RFP64RegClass; -    } -    break; -  case MVT::f80: -    // No f80 support yet. 
-    return 0; -  } - -  unsigned ResultReg = createResultReg(RC); -  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg); -  return ResultReg; -} - - -bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, -                                      const LoadInst *LI) { -  const Value *Ptr = LI->getPointerOperand(); -  X86AddressMode AM; -  if (!X86SelectAddress(Ptr, AM)) -    return false; - -  const X86InstrInfo &XII = (const X86InstrInfo &)TII; - -  unsigned Size = DL.getTypeAllocSize(LI->getType()); -  unsigned Alignment = LI->getAlignment(); - -  if (Alignment == 0)  // Ensure that codegen never sees alignment 0 -    Alignment = DL.getABITypeAlignment(LI->getType()); - -  SmallVector<MachineOperand, 8> AddrOps; -  AM.getFullAddress(AddrOps); - -  MachineInstr *Result = -    XII.foldMemoryOperandImpl(*FuncInfo.MF, MI, OpNo, AddrOps, -                              Size, Alignment, /*AllowCommute=*/true); -  if (!Result) -    return false; - -  Result->addMemOperand(*FuncInfo.MF, createMachineMemOperandFor(LI)); -  FuncInfo.MBB->insert(FuncInfo.InsertPt, Result); -  MI->eraseFromParent(); -  return true; -} - - -namespace llvm { -  FastISel *X86::createFastISel(FunctionLoweringInfo &funcInfo, -                                const TargetLibraryInfo *libInfo) { -    return new X86FastISel(funcInfo, libInfo); -  } -} +//===-- X86FastISel.cpp - X86 FastISel implementation ---------------------===//
 +//
 +//                     The LLVM Compiler Infrastructure
 +//
 +// This file is distributed under the University of Illinois Open Source
 +// License. See LICENSE.TXT for details.
 +//
 +//===----------------------------------------------------------------------===//
 +//
 +// This file defines the X86-specific support for the FastISel class. Much
 +// of the target-specific code is generated by tablegen in the file
 +// X86GenFastISel.inc, which is #included here.
 +//
 +//===----------------------------------------------------------------------===//
 +
 +#include "X86.h"
 +#include "X86CallingConv.h"
 +#include "X86InstrBuilder.h"
 +#include "X86InstrInfo.h"
 +#include "X86MachineFunctionInfo.h"
 +#include "X86RegisterInfo.h"
 +#include "X86Subtarget.h"
 +#include "X86TargetMachine.h"
 +#include "llvm/Analysis/BranchProbabilityInfo.h"
 +#include "llvm/CodeGen/Analysis.h"
 +#include "llvm/CodeGen/FastISel.h"
 +#include "llvm/CodeGen/FunctionLoweringInfo.h"
 +#include "llvm/CodeGen/MachineConstantPool.h"
 +#include "llvm/CodeGen/MachineFrameInfo.h"
 +#include "llvm/CodeGen/MachineRegisterInfo.h"
 +#include "llvm/IR/CallSite.h"
 +#include "llvm/IR/CallingConv.h"
 +#include "llvm/IR/DerivedTypes.h"
 +#include "llvm/IR/GetElementPtrTypeIterator.h"
 +#include "llvm/IR/GlobalAlias.h"
 +#include "llvm/IR/GlobalVariable.h"
 +#include "llvm/IR/Instructions.h"
 +#include "llvm/IR/IntrinsicInst.h"
 +#include "llvm/IR/Operator.h"
 +#include "llvm/Support/ErrorHandling.h"
 +#include "llvm/Target/TargetOptions.h"
 +using namespace llvm;
 +
 +namespace {
 +
 +class X86FastISel final : public FastISel {
 +  /// Subtarget - Keep a pointer to the X86Subtarget around so that we can
 +  /// make the right decision when generating code for different targets.
 +  const X86Subtarget *Subtarget;
 +
 +  /// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87
 +  /// floating point ops.
 +  /// When SSE is available, use it for f32 operations.
 +  /// When SSE2 is available, use it for f64 operations.
 +  bool X86ScalarSSEf64;
 +  bool X86ScalarSSEf32;
 +
 +public:
 +  explicit X86FastISel(FunctionLoweringInfo &funcInfo,
 +                       const TargetLibraryInfo *libInfo)
 +    : FastISel(funcInfo, libInfo) {
 +    Subtarget = &TM.getSubtarget<X86Subtarget>();
 +    X86ScalarSSEf64 = Subtarget->hasSSE2();
 +    X86ScalarSSEf32 = Subtarget->hasSSE1();
 +  }
 +
 +  bool fastSelectInstruction(const Instruction *I) override;
 +
 +  /// \brief The specified machine instr operand is a vreg, and that
 +  /// vreg is being provided by the specified load instruction. If possible,
 +  /// try to fold the load as an operand to the instruction, returning true
 +  /// on success.
 +  bool tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
 +                           const LoadInst *LI) override;
 +
 +  bool fastLowerArguments() override;
 +  bool fastLowerCall(CallLoweringInfo &CLI) override;
 +  bool fastLowerIntrinsicCall(const IntrinsicInst *II) override;
 +
 +#include "X86GenFastISel.inc"
 +
 +private:
 +  bool X86FastEmitCompare(const Value *LHS, const Value *RHS, EVT VT, DebugLoc DL);
 +
 +  bool X86FastEmitLoad(EVT VT, const X86AddressMode &AM, MachineMemOperand *MMO,
 +                       unsigned &ResultReg);
 +
 +  bool X86FastEmitStore(EVT VT, const Value *Val, const X86AddressMode &AM,
 +                        MachineMemOperand *MMO = nullptr, bool Aligned = false);
 +  bool X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill,
 +                        const X86AddressMode &AM,
 +                        MachineMemOperand *MMO = nullptr, bool Aligned = false);
 +
 +  bool X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, EVT SrcVT,
 +                         unsigned &ResultReg);
 +
 +  bool X86SelectAddress(const Value *V, X86AddressMode &AM);
 +  bool X86SelectCallAddress(const Value *V, X86AddressMode &AM);
 +
 +  bool X86SelectLoad(const Instruction *I);
 +
 +  bool X86SelectStore(const Instruction *I);
 +
 +  bool X86SelectRet(const Instruction *I);
 +
 +  bool X86SelectCmp(const Instruction *I);
 +
 +  bool X86SelectZExt(const Instruction *I);
 +
 +  bool X86SelectBranch(const Instruction *I);
 +
 +  bool X86SelectShift(const Instruction *I);
 +
 +  bool X86SelectDivRem(const Instruction *I);
 +
 +  bool X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I);
 +
 +  bool X86FastEmitSSESelect(MVT RetVT, const Instruction *I);
 +
 +  bool X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I);
 +
 +  bool X86SelectSelect(const Instruction *I);
 +
 +  bool X86SelectTrunc(const Instruction *I);
 +
 +  bool X86SelectFPExt(const Instruction *I);
 +  bool X86SelectFPTrunc(const Instruction *I);
 +
 +  const X86InstrInfo *getInstrInfo() const {
 +    return getTargetMachine()->getSubtargetImpl()->getInstrInfo();
 +  }
 +  const X86TargetMachine *getTargetMachine() const {
 +    return static_cast<const X86TargetMachine *>(&TM);
 +  }
 +
 +  bool handleConstantAddresses(const Value *V, X86AddressMode &AM);
 +
 +  unsigned X86MaterializeInt(const ConstantInt *CI, MVT VT);
 +  unsigned X86MaterializeFP(const ConstantFP *CFP, MVT VT);
 +  unsigned X86MaterializeGV(const GlobalValue *GV, MVT VT);
 +  unsigned fastMaterializeConstant(const Constant *C) override;
 +
 +  unsigned fastMaterializeAlloca(const AllocaInst *C) override;
 +
 +  unsigned fastMaterializeFloatZero(const ConstantFP *CF) override;
 +
 +  /// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is
 +  /// computed in an SSE register, not on the X87 floating point stack.
 +  bool isScalarFPTypeInSSEReg(EVT VT) const {
 +    return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 when SSE2 available
 +      (VT == MVT::f32 && X86ScalarSSEf32);   // f32 when SSE1 available
 +  }
 +
 +  bool isTypeLegal(Type *Ty, MVT &VT, bool AllowI1 = false);
 +
 +  bool IsMemcpySmall(uint64_t Len);
 +
 +  bool TryEmitSmallMemcpy(X86AddressMode DestAM,
 +                          X86AddressMode SrcAM, uint64_t Len);
 +
 +  bool foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I,
 +                            const Value *Cond);
 +};
 +
 +} // end anonymous namespace.
 +
 +static std::pair<X86::CondCode, bool>
 +getX86ConditionCode(CmpInst::Predicate Predicate) {
 +  X86::CondCode CC = X86::COND_INVALID;
 +  bool NeedSwap = false;
 +  switch (Predicate) {
 +  default: break;
 +  // Floating-point Predicates
 +  case CmpInst::FCMP_UEQ: CC = X86::COND_E;       break;
 +  case CmpInst::FCMP_OLT: NeedSwap = true; // fall-through
 +  case CmpInst::FCMP_OGT: CC = X86::COND_A;       break;
 +  case CmpInst::FCMP_OLE: NeedSwap = true; // fall-through
 +  case CmpInst::FCMP_OGE: CC = X86::COND_AE;      break;
 +  case CmpInst::FCMP_UGT: NeedSwap = true; // fall-through
 +  case CmpInst::FCMP_ULT: CC = X86::COND_B;       break;
 +  case CmpInst::FCMP_UGE: NeedSwap = true; // fall-through
 +  case CmpInst::FCMP_ULE: CC = X86::COND_BE;      break;
 +  case CmpInst::FCMP_ONE: CC = X86::COND_NE;      break;
 +  case CmpInst::FCMP_UNO: CC = X86::COND_P;       break;
 +  case CmpInst::FCMP_ORD: CC = X86::COND_NP;      break;
 +  case CmpInst::FCMP_OEQ: // fall-through
 +  case CmpInst::FCMP_UNE: CC = X86::COND_INVALID; break;
 +
 +  // Integer Predicates
 +  case CmpInst::ICMP_EQ:  CC = X86::COND_E;       break;
 +  case CmpInst::ICMP_NE:  CC = X86::COND_NE;      break;
 +  case CmpInst::ICMP_UGT: CC = X86::COND_A;       break;
 +  case CmpInst::ICMP_UGE: CC = X86::COND_AE;      break;
 +  case CmpInst::ICMP_ULT: CC = X86::COND_B;       break;
 +  case CmpInst::ICMP_ULE: CC = X86::COND_BE;      break;
 +  case CmpInst::ICMP_SGT: CC = X86::COND_G;       break;
 +  case CmpInst::ICMP_SGE: CC = X86::COND_GE;      break;
 +  case CmpInst::ICMP_SLT: CC = X86::COND_L;       break;
 +  case CmpInst::ICMP_SLE: CC = X86::COND_LE;      break;
 +  }
 +
 +  return std::make_pair(CC, NeedSwap);
 +}
 +
 +static std::pair<unsigned, bool>
 +getX86SSEConditionCode(CmpInst::Predicate Predicate) {
 +  unsigned CC;
 +  bool NeedSwap = false;
 +
 +  // SSE Condition code mapping:
 +  //  0 - EQ
 +  //  1 - LT
 +  //  2 - LE
 +  //  3 - UNORD
 +  //  4 - NEQ
 +  //  5 - NLT
 +  //  6 - NLE
 +  //  7 - ORD
 +  switch (Predicate) {
 +  default: llvm_unreachable("Unexpected predicate");
 +  case CmpInst::FCMP_OEQ: CC = 0;          break;
 +  case CmpInst::FCMP_OGT: NeedSwap = true; // fall-through
 +  case CmpInst::FCMP_OLT: CC = 1;          break;
 +  case CmpInst::FCMP_OGE: NeedSwap = true; // fall-through
 +  case CmpInst::FCMP_OLE: CC = 2;          break;
 +  case CmpInst::FCMP_UNO: CC = 3;          break;
 +  case CmpInst::FCMP_UNE: CC = 4;          break;
 +  case CmpInst::FCMP_ULE: NeedSwap = true; // fall-through
 +  case CmpInst::FCMP_UGE: CC = 5;          break;
 +  case CmpInst::FCMP_ULT: NeedSwap = true; // fall-through
 +  case CmpInst::FCMP_UGT: CC = 6;          break;
 +  case CmpInst::FCMP_ORD: CC = 7;          break;
 +  case CmpInst::FCMP_UEQ:
 +  case CmpInst::FCMP_ONE: CC = 8;          break;
 +  }
 +
 +  return std::make_pair(CC, NeedSwap);
 +}
 +
 +/// \brief Check if it is possible to fold the condition from the XALU intrinsic
 +/// into the user. The condition code will only be updated on success.
 +bool X86FastISel::foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I,
 +                                       const Value *Cond) {
 +  if (!isa<ExtractValueInst>(Cond))
 +    return false;
 +
 +  const auto *EV = cast<ExtractValueInst>(Cond);
 +  if (!isa<IntrinsicInst>(EV->getAggregateOperand()))
 +    return false;
 +
 +  const auto *II = cast<IntrinsicInst>(EV->getAggregateOperand());
 +  MVT RetVT;
 +  const Function *Callee = II->getCalledFunction();
 +  Type *RetTy =
 +    cast<StructType>(Callee->getReturnType())->getTypeAtIndex(0U);
 +  if (!isTypeLegal(RetTy, RetVT))
 +    return false;
 +
 +  if (RetVT != MVT::i32 && RetVT != MVT::i64)
 +    return false;
 +
 +  X86::CondCode TmpCC;
 +  switch (II->getIntrinsicID()) {
 +  default: return false;
 +  case Intrinsic::sadd_with_overflow:
 +  case Intrinsic::ssub_with_overflow:
 +  case Intrinsic::smul_with_overflow:
 +  case Intrinsic::umul_with_overflow: TmpCC = X86::COND_O; break;
 +  case Intrinsic::uadd_with_overflow:
 +  case Intrinsic::usub_with_overflow: TmpCC = X86::COND_B; break;
 +  }
 +
 +  // Check if both instructions are in the same basic block.
 +  if (II->getParent() != I->getParent())
 +    return false;
 +
 +  // Make sure nothing is in the way
 +  BasicBlock::const_iterator Start = I;
 +  BasicBlock::const_iterator End = II;
 +  for (auto Itr = std::prev(Start); Itr != End; --Itr) {
 +    // We only expect extractvalue instructions between the intrinsic and the
 +    // instruction to be selected.
 +    if (!isa<ExtractValueInst>(Itr))
 +      return false;
 +
 +    // Check that the extractvalue operand comes from the intrinsic.
 +    const auto *EVI = cast<ExtractValueInst>(Itr);
 +    if (EVI->getAggregateOperand() != II)
 +      return false;
 +  }
 +
 +  CC = TmpCC;
 +  return true;
 +}
 +
 +bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) {
 +  EVT evt = TLI.getValueType(Ty, /*HandleUnknown=*/true);
 +  if (evt == MVT::Other || !evt.isSimple())
 +    // Unhandled type. Halt "fast" selection and bail.
 +    return false;
 +
 +  VT = evt.getSimpleVT();
 +  // For now, require SSE/SSE2 for performing floating-point operations,
 +  // since x87 requires additional work.
 +  if (VT == MVT::f64 && !X86ScalarSSEf64)
 +    return false;
 +  if (VT == MVT::f32 && !X86ScalarSSEf32)
 +    return false;
 +  // Similarly, no f80 support yet.
 +  if (VT == MVT::f80)
 +    return false;
 +  // We only handle legal types. For example, on x86-32 the instruction
 +  // selector contains all of the 64-bit instructions from x86-64,
 +  // under the assumption that i64 won't be used if the target doesn't
 +  // support it.
 +  return (AllowI1 && VT == MVT::i1) || TLI.isTypeLegal(VT);
 +}
 +
 +#include "X86GenCallingConv.inc"
 +
 +/// X86FastEmitLoad - Emit a machine instruction to load a value of type VT.
 +/// The address is provided by the pre-computed address mode AM.
 +/// Return true and the result register by reference if it is possible.
 +bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM,
 +                                  MachineMemOperand *MMO, unsigned &ResultReg) {
 +  // Get opcode and regclass of the output for the given load instruction.
 +  unsigned Opc = 0;
 +  const TargetRegisterClass *RC = nullptr;
 +  switch (VT.getSimpleVT().SimpleTy) {
 +  default: return false;
 +  case MVT::i1:
 +  case MVT::i8:
 +    Opc = X86::MOV8rm;
 +    RC  = &X86::GR8RegClass;
 +    break;
 +  case MVT::i16:
 +    Opc = X86::MOV16rm;
 +    RC  = &X86::GR16RegClass;
 +    break;
 +  case MVT::i32:
 +    Opc = X86::MOV32rm;
 +    RC  = &X86::GR32RegClass;
 +    break;
 +  case MVT::i64:
 +    // Must be in x86-64 mode.
 +    Opc = X86::MOV64rm;
 +    RC  = &X86::GR64RegClass;
 +    break;
 +  case MVT::f32:
 +    if (X86ScalarSSEf32) {
 +      Opc = Subtarget->hasAVX() ? X86::VMOVSSrm : X86::MOVSSrm;
 +      RC  = &X86::FR32RegClass;
 +    } else {
 +      Opc = X86::LD_Fp32m;
 +      RC  = &X86::RFP32RegClass;
 +    }
 +    break;
 +  case MVT::f64:
 +    if (X86ScalarSSEf64) {
 +      Opc = Subtarget->hasAVX() ? X86::VMOVSDrm : X86::MOVSDrm;
 +      RC  = &X86::FR64RegClass;
 +    } else {
 +      Opc = X86::LD_Fp64m;
 +      RC  = &X86::RFP64RegClass;
 +    }
 +    break;
 +  case MVT::f80:
 +    // No f80 support yet.
 +    return false;
 +  }
 +
 +  ResultReg = createResultReg(RC);
 +  MachineInstrBuilder MIB =
 +    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg);
 +  addFullAddress(MIB, AM);
 +  if (MMO)
 +    MIB->addMemOperand(*FuncInfo.MF, MMO);
 +  return true;
 +}
 +
 +/// X86FastEmitStore - Emit a machine instruction to store a value Val of
 +/// type VT. The address is either pre-computed, consisting of a base ptr, Ptr,
 +/// and a displacement offset, or a GlobalAddress,
 +/// i.e. V. Return true if it is possible.
 +bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill,
 +                                   const X86AddressMode &AM,
 +                                   MachineMemOperand *MMO, bool Aligned) {
 +  // Get opcode and regclass of the output for the given store instruction.
 +  unsigned Opc = 0;
 +  switch (VT.getSimpleVT().SimpleTy) {
 +  case MVT::f80: // No f80 support yet.
 +  default: return false;
 +  case MVT::i1: {
 +    // Mask out all but lowest bit.
 +    unsigned AndResult = createResultReg(&X86::GR8RegClass);
 +    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
 +            TII.get(X86::AND8ri), AndResult)
 +      .addReg(ValReg, getKillRegState(ValIsKill)).addImm(1);
 +    ValReg = AndResult;
 +  }
 +  // FALLTHROUGH, handling i1 as i8.
 +  case MVT::i8:  Opc = X86::MOV8mr;  break;
 +  case MVT::i16: Opc = X86::MOV16mr; break;
 +  case MVT::i32: Opc = X86::MOV32mr; break;
 +  case MVT::i64: Opc = X86::MOV64mr; break; // Must be in x86-64 mode.
 +  case MVT::f32:
 +    Opc = X86ScalarSSEf32 ?
 +          (Subtarget->hasAVX() ? X86::VMOVSSmr : X86::MOVSSmr) : X86::ST_Fp32m;
 +    break;
 +  case MVT::f64:
 +    Opc = X86ScalarSSEf64 ?
 +          (Subtarget->hasAVX() ? X86::VMOVSDmr : X86::MOVSDmr) : X86::ST_Fp64m;
 +    break;
 +  case MVT::v4f32:
 +    if (Aligned)
 +      Opc = Subtarget->hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr;
 +    else
 +      Opc = Subtarget->hasAVX() ? X86::VMOVUPSmr : X86::MOVUPSmr;
 +    break;
 +  case MVT::v2f64:
 +    if (Aligned)
 +      Opc = Subtarget->hasAVX() ? X86::VMOVAPDmr : X86::MOVAPDmr;
 +    else
 +      Opc = Subtarget->hasAVX() ? X86::VMOVUPDmr : X86::MOVUPDmr;
 +    break;
 +  case MVT::v4i32:
 +  case MVT::v2i64:
 +  case MVT::v8i16:
 +  case MVT::v16i8:
 +    if (Aligned)
 +      Opc = Subtarget->hasAVX() ? X86::VMOVDQAmr : X86::MOVDQAmr;
 +    else
 +      Opc = Subtarget->hasAVX() ? X86::VMOVDQUmr : X86::MOVDQUmr;
 +    break;
 +  }
 +
 +  MachineInstrBuilder MIB =
 +    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc));
 +  addFullAddress(MIB, AM).addReg(ValReg, getKillRegState(ValIsKill));
 +  if (MMO)
 +    MIB->addMemOperand(*FuncInfo.MF, MMO);
 +
 +  return true;
 +}
 +
 +bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val,
 +                                   const X86AddressMode &AM,
 +                                   MachineMemOperand *MMO, bool Aligned) {
 +  // Handle 'null' like i32/i64 0.
 +  if (isa<ConstantPointerNull>(Val))
 +    Val = Constant::getNullValue(DL.getIntPtrType(Val->getContext()));
 +
 +  // If this is a store of a simple constant, fold the constant into the store.
 +  if (const ConstantInt *CI = dyn_cast<ConstantInt>(Val)) {
 +    unsigned Opc = 0;
 +    bool Signed = true;
 +    switch (VT.getSimpleVT().SimpleTy) {
 +    default: break;
 +    case MVT::i1:  Signed = false;     // FALLTHROUGH to handle as i8.
 +    case MVT::i8:  Opc = X86::MOV8mi;  break;
 +    case MVT::i16: Opc = X86::MOV16mi; break;
 +    case MVT::i32: Opc = X86::MOV32mi; break;
 +    case MVT::i64:
 +      // Must be a 32-bit sign extended value.
 +      if (isInt<32>(CI->getSExtValue()))
 +        Opc = X86::MOV64mi32;
 +      break;
 +    }
 +
 +    if (Opc) {
 +      MachineInstrBuilder MIB =
 +        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc));
 +      addFullAddress(MIB, AM).addImm(Signed ? (uint64_t) CI->getSExtValue()
 +                                            : CI->getZExtValue());
 +      if (MMO)
 +        MIB->addMemOperand(*FuncInfo.MF, MMO);
 +      return true;
 +    }
 +  }
 +
 +  unsigned ValReg = getRegForValue(Val);
 +  if (ValReg == 0)
 +    return false;
 +
 +  bool ValKill = hasTrivialKill(Val);
 +  return X86FastEmitStore(VT, ValReg, ValKill, AM, MMO, Aligned);
 +}
 +
 +/// X86FastEmitExtend - Emit a machine instruction to extend a value Src of
 +/// type SrcVT to type DstVT using the specified extension opcode Opc (e.g.
 +/// ISD::SIGN_EXTEND).
 +bool X86FastISel::X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT,
 +                                    unsigned Src, EVT SrcVT,
 +                                    unsigned &ResultReg) {
 +  unsigned RR = fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Opc,
 +                           Src, /*TODO: Kill=*/false);
 +  if (RR == 0)
 +    return false;
 +
 +  ResultReg = RR;
 +  return true;
 +}
 +
 +bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) {
 +  // Handle constant address.
 +  if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
 +    // Can't handle alternate code models yet.
 +    if (TM.getCodeModel() != CodeModel::Small)
 +      return false;
 +
 +    // Can't handle TLS yet.
 +    if (GV->isThreadLocal())
 +      return false;
 +
 +    // RIP-relative addresses can't have additional register operands, so if
 +    // we've already folded stuff into the addressing mode, just force the
 +    // global value into its own register, which we can use as the basereg.
 +    if (!Subtarget->isPICStyleRIPRel() ||
 +        (AM.Base.Reg == 0 && AM.IndexReg == 0)) {
 +      // Okay, we've committed to selecting this global. Set up the address.
 +      AM.GV = GV;
 +
 +      // Allow the subtarget to classify the global.
 +      unsigned char GVFlags = Subtarget->ClassifyGlobalReference(GV, TM);
 +
 +      // If this reference is relative to the pic base, set it now.
 +      if (isGlobalRelativeToPICBase(GVFlags)) {
 +        // FIXME: How do we know Base.Reg is free??
 +        AM.Base.Reg = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
 +      }
 +
 +      // Unless the ABI requires an extra load, return a direct reference to
 +      // the global.
 +      if (!isGlobalStubReference(GVFlags)) {
 +        if (Subtarget->isPICStyleRIPRel()) {
 +          // Use rip-relative addressing if we can.  Above we verified that the
 +          // base and index registers are unused.
 +          assert(AM.Base.Reg == 0 && AM.IndexReg == 0);
 +          AM.Base.Reg = X86::RIP;
 +        }
 +        AM.GVOpFlags = GVFlags;
 +        return true;
 +      }
 +
 +      // Ok, we need to do a load from a stub.  If we've already loaded from
 +      // this stub, reuse the loaded pointer, otherwise emit the load now.
 +      DenseMap<const Value *, unsigned>::iterator I = LocalValueMap.find(V);
 +      unsigned LoadReg;
 +      if (I != LocalValueMap.end() && I->second != 0) {
 +        LoadReg = I->second;
 +      } else {
 +        // Issue load from stub.
 +        unsigned Opc = 0;
 +        const TargetRegisterClass *RC = nullptr;
 +        X86AddressMode StubAM;
 +        StubAM.Base.Reg = AM.Base.Reg;
 +        StubAM.GV = GV;
 +        StubAM.GVOpFlags = GVFlags;
 +
 +        // Prepare for inserting code in the local-value area.
 +        SavePoint SaveInsertPt = enterLocalValueArea();
 +
 +        if (TLI.getPointerTy() == MVT::i64) {
 +          Opc = X86::MOV64rm;
 +          RC  = &X86::GR64RegClass;
 +
 +          if (Subtarget->isPICStyleRIPRel())
 +            StubAM.Base.Reg = X86::RIP;
 +        } else {
 +          Opc = X86::MOV32rm;
 +          RC  = &X86::GR32RegClass;
 +        }
 +
 +        LoadReg = createResultReg(RC);
 +        MachineInstrBuilder LoadMI =
 +          BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), LoadReg);
 +        addFullAddress(LoadMI, StubAM);
 +
 +        // Ok, back to normal mode.
 +        leaveLocalValueArea(SaveInsertPt);
 +
 +        // Prevent loading GV stub multiple times in same MBB.
 +        LocalValueMap[V] = LoadReg;
 +      }
 +
 +      // Now construct the final address. Note that the Disp, Scale,
 +      // and Index values may already be set here.
 +      AM.Base.Reg = LoadReg;
 +      AM.GV = nullptr;
 +      return true;
 +    }
 +  }
 +
 +  // If all else fails, try to materialize the value in a register.
 +  if (!AM.GV || !Subtarget->isPICStyleRIPRel()) {
 +    if (AM.Base.Reg == 0) {
 +      AM.Base.Reg = getRegForValue(V);
 +      return AM.Base.Reg != 0;
 +    }
 +    if (AM.IndexReg == 0) {
 +      assert(AM.Scale == 1 && "Scale with no index!");
 +      AM.IndexReg = getRegForValue(V);
 +      return AM.IndexReg != 0;
 +    }
 +  }
 +
 +  return false;
 +}
 +
 +/// X86SelectAddress - Attempt to fill in an address from the given value.
 +///
 +bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) {
 +  SmallVector<const Value *, 32> GEPs;
 +redo_gep:
 +  const User *U = nullptr;
 +  unsigned Opcode = Instruction::UserOp1;
 +  if (const Instruction *I = dyn_cast<Instruction>(V)) {
 +    // Don't walk into other basic blocks; it's possible we haven't
 +    // visited them yet, so the instructions may not yet be assigned
 +    // virtual registers.
 +    if (FuncInfo.StaticAllocaMap.count(static_cast<const AllocaInst *>(V)) ||
 +        FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) {
 +      Opcode = I->getOpcode();
 +      U = I;
 +    }
 +  } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(V)) {
 +    Opcode = C->getOpcode();
 +    U = C;
 +  }
 +
 +  if (PointerType *Ty = dyn_cast<PointerType>(V->getType()))
 +    if (Ty->getAddressSpace() > 255)
 +      // Fast instruction selection doesn't support the special
 +      // address spaces.
 +      return false;
 +
 +  switch (Opcode) {
 +  default: break;
 +  case Instruction::BitCast:
 +    // Look past bitcasts.
 +    return X86SelectAddress(U->getOperand(0), AM);
 +
 +  case Instruction::IntToPtr:
 +    // Look past no-op inttoptrs.
 +    if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy())
 +      return X86SelectAddress(U->getOperand(0), AM);
 +    break;
 +
 +  case Instruction::PtrToInt:
 +    // Look past no-op ptrtoints.
 +    if (TLI.getValueType(U->getType()) == TLI.getPointerTy())
 +      return X86SelectAddress(U->getOperand(0), AM);
 +    break;
 +
 +  case Instruction::Alloca: {
 +    // Do static allocas.
 +    const AllocaInst *A = cast<AllocaInst>(V);
 +    DenseMap<const AllocaInst *, int>::iterator SI =
 +      FuncInfo.StaticAllocaMap.find(A);
 +    if (SI != FuncInfo.StaticAllocaMap.end()) {
 +      AM.BaseType = X86AddressMode::FrameIndexBase;
 +      AM.Base.FrameIndex = SI->second;
 +      return true;
 +    }
 +    break;
 +  }
 +
 +  case Instruction::Add: {
 +    // Adds of constants are common and easy enough.
 +    if (const ConstantInt *CI = dyn_cast<ConstantInt>(U->getOperand(1))) {
 +      uint64_t Disp = (int32_t)AM.Disp + (uint64_t)CI->getSExtValue();
 +      // They have to fit in the 32-bit signed displacement field though.
 +      if (isInt<32>(Disp)) {
 +        AM.Disp = (uint32_t)Disp;
 +        return X86SelectAddress(U->getOperand(0), AM);
 +      }
 +    }
 +    break;
 +  }
 +
 +  case Instruction::GetElementPtr: {
 +    X86AddressMode SavedAM = AM;
 +
 +    // Pattern-match simple GEPs.
 +    uint64_t Disp = (int32_t)AM.Disp;
 +    unsigned IndexReg = AM.IndexReg;
 +    unsigned Scale = AM.Scale;
 +    gep_type_iterator GTI = gep_type_begin(U);
 +    // Iterate through the indices, folding what we can. Constants can be
 +    // folded, and one dynamic index can be handled, if the scale is supported.
 +    for (User::const_op_iterator i = U->op_begin() + 1, e = U->op_end();
 +         i != e; ++i, ++GTI) {
 +      const Value *Op = *i;
 +      if (StructType *STy = dyn_cast<StructType>(*GTI)) {
 +        const StructLayout *SL = DL.getStructLayout(STy);
 +        Disp += SL->getElementOffset(cast<ConstantInt>(Op)->getZExtValue());
 +        continue;
 +      }
 +
 +      // An array/variable index is always of the form i*S where S is the
 +      // constant scale size.  See if we can push the scale into immediates.
 +      uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType());
 +      for (;;) {
 +        if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
 +          // Constant-offset addressing.
 +          Disp += CI->getSExtValue() * S;
 +          break;
 +        }
 +        if (canFoldAddIntoGEP(U, Op)) {
 +          // A compatible add with a constant operand. Fold the constant.
 +          ConstantInt *CI =
 +            cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1));
 +          Disp += CI->getSExtValue() * S;
 +          // Iterate on the other operand.
 +          Op = cast<AddOperator>(Op)->getOperand(0);
 +          continue;
 +        }
 +        if (IndexReg == 0 &&
 +            (!AM.GV || !Subtarget->isPICStyleRIPRel()) &&
 +            (S == 1 || S == 2 || S == 4 || S == 8)) {
 +          // Scaled-index addressing.
 +          Scale = S;
 +          IndexReg = getRegForGEPIndex(Op).first;
 +          if (IndexReg == 0)
 +            return false;
 +          break;
 +        }
 +        // Unsupported.
 +        goto unsupported_gep;
 +      }
 +    }
 +
 +    // Check for displacement overflow.
 +    if (!isInt<32>(Disp))
 +      break;
 +
 +    AM.IndexReg = IndexReg;
 +    AM.Scale = Scale;
 +    AM.Disp = (uint32_t)Disp;
 +    GEPs.push_back(V);
 +
 +    if (const GetElementPtrInst *GEP =
 +          dyn_cast<GetElementPtrInst>(U->getOperand(0))) {
 +      // Ok, the GEP indices were covered by constant-offset and scaled-index
 +      // addressing. Update the address state and move on to examining the base.
 +      V = GEP;
 +      goto redo_gep;
 +    } else if (X86SelectAddress(U->getOperand(0), AM)) {
 +      return true;
 +    }
 +
 +    // If we couldn't merge the gep value into this addr mode, revert back to
 +    // our address and just match the value instead of completely failing.
 +    AM = SavedAM;
 +
 +    for (SmallVectorImpl<const Value *>::reverse_iterator
 +           I = GEPs.rbegin(), E = GEPs.rend(); I != E; ++I)
 +      if (handleConstantAddresses(*I, AM))
 +        return true;
 +
 +    return false;
 +  unsupported_gep:
 +    // Ok, the GEP indices weren't all covered.
 +    break;
 +  }
 +  }
 +
 +  return handleConstantAddresses(V, AM);
 +}
 +
 +/// X86SelectCallAddress - Attempt to fill in an address from the given value.
 +///
 +bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) {
 +  const User *U = nullptr;
 +  unsigned Opcode = Instruction::UserOp1;
 +  const Instruction *I = dyn_cast<Instruction>(V);
 +  // Record if the value is defined in the same basic block.
 +  //
 +  // This information is crucial to know whether or not folding an
 +  // operand is valid.
 +  // Indeed, FastISel generates or reuses a virtual register for all
 +  // operands of all instructions it selects. Obviously, the definition and
 +  // its uses must use the same virtual register; otherwise the produced
 +  // code is incorrect.
 +  // Before instruction selection, FunctionLoweringInfo::set sets the virtual
 +  // registers for values that are alive across basic blocks. This ensures
 +  // that the values are set consistently across basic blocks, even
 +  // if different instruction selection mechanisms are used (e.g., a mix of
 +  // SDISel and FastISel).
 +  // For values local to a basic block, the instruction selection process
 +  // generates these virtual registers with whatever method is appropriate
 +  // for its needs. In particular, FastISel and SDISel do not share the way
 +  // local virtual registers are set.
+  // Therefore, it is impossible (or at least unsafe) to share values
+  // between basic blocks unless they use the same instruction selection
+  // method, which is not guaranteed for X86.
+  // Moreover, things like hasOneUse could not be used accurately if we
+  // allowed references to values across basic blocks when they were not
+  // originally live across those blocks.
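+  // For example (illustrative), folding a bitcast defined in another basic
+  // block into this call address could end up reading a virtual register
+  // that FastISel never assigned in this block.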
 +  bool InMBB = true;
 +  if (I) {
 +    Opcode = I->getOpcode();
 +    U = I;
 +    InMBB = I->getParent() == FuncInfo.MBB->getBasicBlock();
 +  } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(V)) {
 +    Opcode = C->getOpcode();
 +    U = C;
 +  }
 +
 +  switch (Opcode) {
 +  default: break;
 +  case Instruction::BitCast:
+    // Look past bitcasts if the operand is in the same BB.
 +    if (InMBB)
 +      return X86SelectCallAddress(U->getOperand(0), AM);
 +    break;
 +
 +  case Instruction::IntToPtr:
+    // Look past no-op inttoptrs if the operand is in the same BB.
 +    if (InMBB &&
 +        TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy())
 +      return X86SelectCallAddress(U->getOperand(0), AM);
 +    break;
 +
 +  case Instruction::PtrToInt:
+    // Look past no-op ptrtoints if the operand is in the same BB.
 +    if (InMBB &&
 +        TLI.getValueType(U->getType()) == TLI.getPointerTy())
 +      return X86SelectCallAddress(U->getOperand(0), AM);
 +    break;
 +  }
 +
 +  // Handle constant address.
 +  if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
 +    // Can't handle alternate code models yet.
 +    if (TM.getCodeModel() != CodeModel::Small)
 +      return false;
 +
 +    // RIP-relative addresses can't have additional register operands.
 +    if (Subtarget->isPICStyleRIPRel() &&
 +        (AM.Base.Reg != 0 || AM.IndexReg != 0))
 +      return false;
 +
 +    // Can't handle DLL Import.
 +    if (GV->hasDLLImportStorageClass())
 +      return false;
 +
 +    // Can't handle TLS.
 +    if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
 +      if (GVar->isThreadLocal())
 +        return false;
 +
 +    // Okay, we've committed to selecting this global. Set up the basic address.
 +    AM.GV = GV;
 +
 +    // No ABI requires an extra load for anything other than DLLImport, which
 +    // we rejected above. Return a direct reference to the global.
 +    if (Subtarget->isPICStyleRIPRel()) {
 +      // Use rip-relative addressing if we can.  Above we verified that the
 +      // base and index registers are unused.
 +      assert(AM.Base.Reg == 0 && AM.IndexReg == 0);
 +      AM.Base.Reg = X86::RIP;
 +    } else if (Subtarget->isPICStyleStubPIC()) {
 +      AM.GVOpFlags = X86II::MO_PIC_BASE_OFFSET;
 +    } else if (Subtarget->isPICStyleGOT()) {
 +      AM.GVOpFlags = X86II::MO_GOTOFF;
 +    }
 +
 +    return true;
 +  }
 +
 +  // If all else fails, try to materialize the value in a register.
 +  if (!AM.GV || !Subtarget->isPICStyleRIPRel()) {
 +    if (AM.Base.Reg == 0) {
 +      AM.Base.Reg = getRegForValue(V);
 +      return AM.Base.Reg != 0;
 +    }
 +    if (AM.IndexReg == 0) {
 +      assert(AM.Scale == 1 && "Scale with no index!");
 +      AM.IndexReg = getRegForValue(V);
 +      return AM.IndexReg != 0;
 +    }
 +  }
 +
 +  return false;
 +}
 +
 +
 +/// X86SelectStore - Select and emit code to implement store instructions.
 +bool X86FastISel::X86SelectStore(const Instruction *I) {
 +  // Atomic stores need special handling.
 +  const StoreInst *S = cast<StoreInst>(I);
 +
 +  if (S->isAtomic())
 +    return false;
 +
 +  const Value *Val = S->getValueOperand();
 +  const Value *Ptr = S->getPointerOperand();
 +
 +  MVT VT;
 +  if (!isTypeLegal(Val->getType(), VT, /*AllowI1=*/true))
 +    return false;
 +
 +  unsigned Alignment = S->getAlignment();
 +  unsigned ABIAlignment = DL.getABITypeAlignment(Val->getType());
 +  if (Alignment == 0) // Ensure that codegen never sees alignment 0
 +    Alignment = ABIAlignment;
 +  bool Aligned = Alignment >= ABIAlignment;
 +
 +  X86AddressMode AM;
 +  if (!X86SelectAddress(Ptr, AM))
 +    return false;
 +
 +  return X86FastEmitStore(VT, Val, AM, createMachineMemOperandFor(I), Aligned);
 +}
 +
 +/// X86SelectRet - Select and emit code to implement ret instructions.
 +bool X86FastISel::X86SelectRet(const Instruction *I) {
 +  const ReturnInst *Ret = cast<ReturnInst>(I);
 +  const Function &F = *I->getParent()->getParent();
 +  const X86MachineFunctionInfo *X86MFInfo =
 +      FuncInfo.MF->getInfo<X86MachineFunctionInfo>();
 +
 +  if (!FuncInfo.CanLowerReturn)
 +    return false;
 +
 +  CallingConv::ID CC = F.getCallingConv();
 +  if (CC != CallingConv::C &&
 +      CC != CallingConv::Fast &&
 +      CC != CallingConv::X86_FastCall &&
 +      CC != CallingConv::X86_64_SysV)
 +    return false;
 +
 +  if (Subtarget->isCallingConvWin64(CC))
 +    return false;
 +
 +  // Don't handle popping bytes on return for now.
 +  if (X86MFInfo->getBytesToPopOnReturn() != 0)
 +    return false;
 +
 +  // fastcc with -tailcallopt is intended to provide a guaranteed
+  // tail call optimization. FastISel doesn't know how to do that.
 +  if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt)
 +    return false;
 +
 +  // Let SDISel handle vararg functions.
 +  if (F.isVarArg())
 +    return false;
 +
 +  // Build a list of return value registers.
 +  SmallVector<unsigned, 4> RetRegs;
 +
 +  if (Ret->getNumOperands() > 0) {
 +    SmallVector<ISD::OutputArg, 4> Outs;
 +    GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI);
 +
 +    // Analyze operands of the call, assigning locations to each operand.
 +    SmallVector<CCValAssign, 16> ValLocs;
 +    CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, ValLocs, I->getContext());
 +    CCInfo.AnalyzeReturn(Outs, RetCC_X86);
 +
 +    const Value *RV = Ret->getOperand(0);
 +    unsigned Reg = getRegForValue(RV);
 +    if (Reg == 0)
 +      return false;
 +
 +    // Only handle a single return value for now.
 +    if (ValLocs.size() != 1)
 +      return false;
 +
 +    CCValAssign &VA = ValLocs[0];
 +
 +    // Don't bother handling odd stuff for now.
 +    if (VA.getLocInfo() != CCValAssign::Full)
 +      return false;
 +    // Only handle register returns for now.
 +    if (!VA.isRegLoc())
 +      return false;
 +
 +    // The calling-convention tables for x87 returns don't tell
 +    // the whole story.
 +    if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
 +      return false;
 +
 +    unsigned SrcReg = Reg + VA.getValNo();
 +    EVT SrcVT = TLI.getValueType(RV->getType());
 +    EVT DstVT = VA.getValVT();
 +    // Special handling for extended integers.
 +    if (SrcVT != DstVT) {
 +      if (SrcVT != MVT::i1 && SrcVT != MVT::i8 && SrcVT != MVT::i16)
 +        return false;
 +
 +      if (!Outs[0].Flags.isZExt() && !Outs[0].Flags.isSExt())
 +        return false;
 +
 +      assert(DstVT == MVT::i32 && "X86 should always ext to i32");
 +
 +      if (SrcVT == MVT::i1) {
 +        if (Outs[0].Flags.isSExt())
 +          return false;
 +        SrcReg = fastEmitZExtFromI1(MVT::i8, SrcReg, /*TODO: Kill=*/false);
 +        SrcVT = MVT::i8;
 +      }
 +      unsigned Op = Outs[0].Flags.isZExt() ? ISD::ZERO_EXTEND :
 +                                             ISD::SIGN_EXTEND;
 +      SrcReg = fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Op,
 +                          SrcReg, /*TODO: Kill=*/false);
 +    }
 +
 +    // Make the copy.
 +    unsigned DstReg = VA.getLocReg();
 +    const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
 +    // Avoid a cross-class copy. This is very unlikely.
 +    if (!SrcRC->contains(DstReg))
 +      return false;
 +    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
 +            TII.get(TargetOpcode::COPY), DstReg).addReg(SrcReg);
 +
 +    // Add register to return instruction.
 +    RetRegs.push_back(VA.getLocReg());
 +  }
 +
 +  // The x86-64 ABI for returning structs by value requires that we copy
 +  // the sret argument into %rax for the return. We saved the argument into
 +  // a virtual register in the entry block, so now we copy the value out
 +  // and into %rax. We also do the same with %eax for Win32.
 +  if (F.hasStructRetAttr() &&
 +      (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) {
 +    unsigned Reg = X86MFInfo->getSRetReturnReg();
 +    assert(Reg &&
 +           "SRetReturnReg should have been set in LowerFormalArguments()!");
 +    unsigned RetReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
 +    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
 +            TII.get(TargetOpcode::COPY), RetReg).addReg(Reg);
 +    RetRegs.push_back(RetReg);
 +  }
 +
 +  // Now emit the RET.
 +  MachineInstrBuilder MIB =
 +    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
 +            TII.get(Subtarget->is64Bit() ? X86::RETQ : X86::RETL));
 +  for (unsigned i = 0, e = RetRegs.size(); i != e; ++i)
 +    MIB.addReg(RetRegs[i], RegState::Implicit);
 +  return true;
 +}
 +
 +/// X86SelectLoad - Select and emit code to implement load instructions.
 +///
 +bool X86FastISel::X86SelectLoad(const Instruction *I) {
 +  const LoadInst *LI = cast<LoadInst>(I);
 +
 +  // Atomic loads need special handling.
 +  if (LI->isAtomic())
 +    return false;
 +
 +  MVT VT;
 +  if (!isTypeLegal(LI->getType(), VT, /*AllowI1=*/true))
 +    return false;
 +
 +  const Value *Ptr = LI->getPointerOperand();
 +
 +  X86AddressMode AM;
 +  if (!X86SelectAddress(Ptr, AM))
 +    return false;
 +
 +  unsigned ResultReg = 0;
 +  if (!X86FastEmitLoad(VT, AM, createMachineMemOperandFor(LI), ResultReg))
 +    return false;
 +
 +  updateValueMap(I, ResultReg);
 +  return true;
 +}
 +
 +static unsigned X86ChooseCmpOpcode(EVT VT, const X86Subtarget *Subtarget) {
 +  bool HasAVX = Subtarget->hasAVX();
 +  bool X86ScalarSSEf32 = Subtarget->hasSSE1();
 +  bool X86ScalarSSEf64 = Subtarget->hasSSE2();
 +
 +  switch (VT.getSimpleVT().SimpleTy) {
 +  default:       return 0;
 +  case MVT::i8:  return X86::CMP8rr;
 +  case MVT::i16: return X86::CMP16rr;
 +  case MVT::i32: return X86::CMP32rr;
 +  case MVT::i64: return X86::CMP64rr;
 +  case MVT::f32:
 +    return X86ScalarSSEf32 ? (HasAVX ? X86::VUCOMISSrr : X86::UCOMISSrr) : 0;
 +  case MVT::f64:
 +    return X86ScalarSSEf64 ? (HasAVX ? X86::VUCOMISDrr : X86::UCOMISDrr) : 0;
 +  }
 +}
 +
+/// X86ChooseCmpImmediateOpcode - If we have a comparison with a constant
+/// RHSC as the RHS of the comparison, return an opcode that works for the
+/// compare (e.g. CMP32ri); otherwise return 0.
 +static unsigned X86ChooseCmpImmediateOpcode(EVT VT, const ConstantInt *RHSC) {
 +  switch (VT.getSimpleVT().SimpleTy) {
 +  // Otherwise, we can't fold the immediate into this comparison.
 +  default: return 0;
 +  case MVT::i8: return X86::CMP8ri;
 +  case MVT::i16: return X86::CMP16ri;
 +  case MVT::i32: return X86::CMP32ri;
 +  case MVT::i64:
 +    // 64-bit comparisons are only valid if the immediate fits in a 32-bit sext
 +    // field.
 +    if ((int)RHSC->getSExtValue() == RHSC->getSExtValue())
 +      return X86::CMP64ri32;
 +    return 0;
 +  }
 +}
 +
 +bool X86FastISel::X86FastEmitCompare(const Value *Op0, const Value *Op1,
 +                                     EVT VT, DebugLoc CurDbgLoc) {
 +  unsigned Op0Reg = getRegForValue(Op0);
 +  if (Op0Reg == 0) return false;
 +
 +  // Handle 'null' like i32/i64 0.
 +  if (isa<ConstantPointerNull>(Op1))
 +    Op1 = Constant::getNullValue(DL.getIntPtrType(Op0->getContext()));
 +
 +  // We have two options: compare with register or immediate.  If the RHS of
 +  // the compare is an immediate that we can fold into this compare, use
 +  // CMPri, otherwise use CMPrr.
 +  if (const ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) {
 +    if (unsigned CompareImmOpc = X86ChooseCmpImmediateOpcode(VT, Op1C)) {
 +      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, CurDbgLoc, TII.get(CompareImmOpc))
 +        .addReg(Op0Reg)
 +        .addImm(Op1C->getSExtValue());
 +      return true;
 +    }
 +  }
 +
 +  unsigned CompareOpc = X86ChooseCmpOpcode(VT, Subtarget);
 +  if (CompareOpc == 0) return false;
 +
 +  unsigned Op1Reg = getRegForValue(Op1);
 +  if (Op1Reg == 0) return false;
 +  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, CurDbgLoc, TII.get(CompareOpc))
 +    .addReg(Op0Reg)
 +    .addReg(Op1Reg);
 +
 +  return true;
 +}
 +
 +bool X86FastISel::X86SelectCmp(const Instruction *I) {
 +  const CmpInst *CI = cast<CmpInst>(I);
 +
 +  MVT VT;
 +  if (!isTypeLegal(I->getOperand(0)->getType(), VT))
 +    return false;
 +
 +  // Try to optimize or fold the cmp.
 +  CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
 +  unsigned ResultReg = 0;
 +  switch (Predicate) {
 +  default: break;
 +  case CmpInst::FCMP_FALSE: {
 +    ResultReg = createResultReg(&X86::GR32RegClass);
 +    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV32r0),
 +            ResultReg);
 +    ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultReg, /*Kill=*/true,
 +                                           X86::sub_8bit);
 +    if (!ResultReg)
 +      return false;
 +    break;
 +  }
 +  case CmpInst::FCMP_TRUE: {
 +    ResultReg = createResultReg(&X86::GR8RegClass);
 +    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV8ri),
 +            ResultReg).addImm(1);
 +    break;
 +  }
 +  }
 +
 +  if (ResultReg) {
 +    updateValueMap(I, ResultReg);
 +    return true;
 +  }
 +
 +  const Value *LHS = CI->getOperand(0);
 +  const Value *RHS = CI->getOperand(1);
 +
 +  // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, 0.0.
 +  // We don't have to materialize a zero constant for this case and can just use
 +  // %x again on the RHS.
 +  if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) {
 +    const auto *RHSC = dyn_cast<ConstantFP>(RHS);
 +    if (RHSC && RHSC->isNullValue())
 +      RHS = LHS;
 +  }
 +
 +  // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction.
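+  // For example (illustrative), "fcmp oeq" is lowered roughly as:
+  //   ucomiss %rhs, %lhs
+  //   sete    %al
+  //   setnp   %cl
+  //   andb    %cl, %al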
 +  static unsigned SETFOpcTable[2][3] = {
 +    { X86::SETEr,  X86::SETNPr, X86::AND8rr },
 +    { X86::SETNEr, X86::SETPr,  X86::OR8rr  }
 +  };
 +  unsigned *SETFOpc = nullptr;
 +  switch (Predicate) {
 +  default: break;
 +  case CmpInst::FCMP_OEQ: SETFOpc = &SETFOpcTable[0][0]; break;
 +  case CmpInst::FCMP_UNE: SETFOpc = &SETFOpcTable[1][0]; break;
 +  }
 +
 +  ResultReg = createResultReg(&X86::GR8RegClass);
 +  if (SETFOpc) {
 +    if (!X86FastEmitCompare(LHS, RHS, VT, I->getDebugLoc()))
 +      return false;
 +
 +    unsigned FlagReg1 = createResultReg(&X86::GR8RegClass);
 +    unsigned FlagReg2 = createResultReg(&X86::GR8RegClass);
 +    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[0]),
 +            FlagReg1);
 +    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[1]),
 +            FlagReg2);
 +    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[2]),
 +            ResultReg).addReg(FlagReg1).addReg(FlagReg2);
 +    updateValueMap(I, ResultReg);
 +    return true;
 +  }
 +
 +  X86::CondCode CC;
 +  bool SwapArgs;
 +  std::tie(CC, SwapArgs) = getX86ConditionCode(Predicate);
 +  assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
 +  unsigned Opc = X86::getSETFromCond(CC);
 +
 +  if (SwapArgs)
 +    std::swap(LHS, RHS);
 +
 +  // Emit a compare of LHS/RHS.
 +  if (!X86FastEmitCompare(LHS, RHS, VT, I->getDebugLoc()))
 +    return false;
 +
 +  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg);
 +  updateValueMap(I, ResultReg);
 +  return true;
 +}
 +
 +bool X86FastISel::X86SelectZExt(const Instruction *I) {
 +  EVT DstVT = TLI.getValueType(I->getType());
 +  if (!TLI.isTypeLegal(DstVT))
 +    return false;
 +
 +  unsigned ResultReg = getRegForValue(I->getOperand(0));
 +  if (ResultReg == 0)
 +    return false;
 +
 +  // Handle zero-extension from i1 to i8, which is common.
 +  MVT SrcVT = TLI.getSimpleValueType(I->getOperand(0)->getType());
 +  if (SrcVT.SimpleTy == MVT::i1) {
 +    // Set the high bits to zero.
 +    ResultReg = fastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false);
 +    SrcVT = MVT::i8;
 +
 +    if (ResultReg == 0)
 +      return false;
 +  }
 +
 +  if (DstVT == MVT::i64) {
 +    // Handle extension to 64-bits via sub-register shenanigans.
 +    unsigned MovInst;
 +
 +    switch (SrcVT.SimpleTy) {
 +    case MVT::i8:  MovInst = X86::MOVZX32rr8;  break;
 +    case MVT::i16: MovInst = X86::MOVZX32rr16; break;
 +    case MVT::i32: MovInst = X86::MOV32rr;     break;
 +    default: llvm_unreachable("Unexpected zext to i64 source type");
 +    }
 +
 +    unsigned Result32 = createResultReg(&X86::GR32RegClass);
 +    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(MovInst), Result32)
 +      .addReg(ResultReg);
 +
 +    ResultReg = createResultReg(&X86::GR64RegClass);
 +    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::SUBREG_TO_REG),
 +            ResultReg)
 +      .addImm(0).addReg(Result32).addImm(X86::sub_32bit);
 +  } else if (DstVT != MVT::i8) {
 +    ResultReg = fastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::ZERO_EXTEND,
 +                           ResultReg, /*Kill=*/true);
 +    if (ResultReg == 0)
 +      return false;
 +  }
 +
 +  updateValueMap(I, ResultReg);
 +  return true;
 +}
 +
 +bool X86FastISel::X86SelectBranch(const Instruction *I) {
 +  // Unconditional branches are selected by tablegen-generated code.
 +  // Handle a conditional branch.
 +  const BranchInst *BI = cast<BranchInst>(I);
 +  MachineBasicBlock *TrueMBB = FuncInfo.MBBMap[BI->getSuccessor(0)];
 +  MachineBasicBlock *FalseMBB = FuncInfo.MBBMap[BI->getSuccessor(1)];
 +
 +  // Fold the common case of a conditional branch with a comparison
 +  // in the same block (values defined on other blocks may not have
 +  // initialized registers).
 +  X86::CondCode CC;
 +  if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
 +    if (CI->hasOneUse() && CI->getParent() == I->getParent()) {
 +      EVT VT = TLI.getValueType(CI->getOperand(0)->getType());
 +
 +      // Try to optimize or fold the cmp.
 +      CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
 +      switch (Predicate) {
 +      default: break;
 +      case CmpInst::FCMP_FALSE: fastEmitBranch(FalseMBB, DbgLoc); return true;
 +      case CmpInst::FCMP_TRUE:  fastEmitBranch(TrueMBB, DbgLoc); return true;
 +      }
 +
 +      const Value *CmpLHS = CI->getOperand(0);
 +      const Value *CmpRHS = CI->getOperand(1);
 +
 +      // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x,
 +      // 0.0.
 +      // We don't have to materialize a zero constant for this case and can just
 +      // use %x again on the RHS.
 +      if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) {
 +        const auto *CmpRHSC = dyn_cast<ConstantFP>(CmpRHS);
 +        if (CmpRHSC && CmpRHSC->isNullValue())
 +          CmpRHS = CmpLHS;
 +      }
 +
 +      // Try to take advantage of fallthrough opportunities.
 +      if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) {
 +        std::swap(TrueMBB, FalseMBB);
 +        Predicate = CmpInst::getInversePredicate(Predicate);
 +      }
 +
 +      // FCMP_OEQ and FCMP_UNE cannot be expressed with a single flag/condition
 +      // code check. Instead two branch instructions are required to check all
 +      // the flags. First we change the predicate to a supported condition code,
+      // which will be the first branch. Later on we will emit the second
 +      // branch.
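+      // For example (illustrative), "br (fcmp une %a, %b)" becomes roughly:
+      //   ucomiss %b, %a
+      //   jne     TrueMBB
+      //   jp      TrueMBB
+      //   jmp     FalseMBB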
 +      bool NeedExtraBranch = false;
 +      switch (Predicate) {
 +      default: break;
 +      case CmpInst::FCMP_OEQ:
 +        std::swap(TrueMBB, FalseMBB); // fall-through
 +      case CmpInst::FCMP_UNE:
 +        NeedExtraBranch = true;
 +        Predicate = CmpInst::FCMP_ONE;
 +        break;
 +      }
 +
 +      bool SwapArgs;
 +      unsigned BranchOpc;
 +      std::tie(CC, SwapArgs) = getX86ConditionCode(Predicate);
 +      assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
 +
 +      BranchOpc = X86::GetCondBranchFromCond(CC);
 +      if (SwapArgs)
 +        std::swap(CmpLHS, CmpRHS);
 +
 +      // Emit a compare of the LHS and RHS, setting the flags.
 +      if (!X86FastEmitCompare(CmpLHS, CmpRHS, VT, CI->getDebugLoc()))
 +        return false;
 +
 +      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc))
 +        .addMBB(TrueMBB);
 +
 +      // X86 requires a second branch to handle UNE (and OEQ, which is mapped
 +      // to UNE above).
 +      if (NeedExtraBranch) {
 +        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JP_1))
 +          .addMBB(TrueMBB);
 +      }
 +
 +      // Obtain the branch weight and add the TrueBB to the successor list.
 +      uint32_t BranchWeight = 0;
 +      if (FuncInfo.BPI)
 +        BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
 +                                                   TrueMBB->getBasicBlock());
 +      FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight);
 +
 +      // Emits an unconditional branch to the FalseBB, obtains the branch
 +      // weight, and adds it to the successor list.
 +      fastEmitBranch(FalseMBB, DbgLoc);
 +
 +      return true;
 +    }
 +  } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) {
 +    // Handle things like "%cond = trunc i32 %X to i1 / br i1 %cond", which
 +    // typically happen for _Bool and C++ bools.
 +    MVT SourceVT;
 +    if (TI->hasOneUse() && TI->getParent() == I->getParent() &&
 +        isTypeLegal(TI->getOperand(0)->getType(), SourceVT)) {
 +      unsigned TestOpc = 0;
 +      switch (SourceVT.SimpleTy) {
 +      default: break;
 +      case MVT::i8:  TestOpc = X86::TEST8ri; break;
 +      case MVT::i16: TestOpc = X86::TEST16ri; break;
 +      case MVT::i32: TestOpc = X86::TEST32ri; break;
 +      case MVT::i64: TestOpc = X86::TEST64ri32; break;
 +      }
 +      if (TestOpc) {
 +        unsigned OpReg = getRegForValue(TI->getOperand(0));
 +        if (OpReg == 0) return false;
 +        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TestOpc))
 +          .addReg(OpReg).addImm(1);
 +
 +        unsigned JmpOpc = X86::JNE_1;
 +        if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) {
 +          std::swap(TrueMBB, FalseMBB);
 +          JmpOpc = X86::JE_1;
 +        }
 +
 +        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(JmpOpc))
 +          .addMBB(TrueMBB);
 +        fastEmitBranch(FalseMBB, DbgLoc);
 +        uint32_t BranchWeight = 0;
 +        if (FuncInfo.BPI)
 +          BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
 +                                                     TrueMBB->getBasicBlock());
 +        FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight);
 +        return true;
 +      }
 +    }
 +  } else if (foldX86XALUIntrinsic(CC, BI, BI->getCondition())) {
 +    // Fake request the condition, otherwise the intrinsic might be completely
 +    // optimized away.
 +    unsigned TmpReg = getRegForValue(BI->getCondition());
 +    if (TmpReg == 0)
 +      return false;
 +
 +    unsigned BranchOpc = X86::GetCondBranchFromCond(CC);
 +
 +    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc))
 +      .addMBB(TrueMBB);
 +    fastEmitBranch(FalseMBB, DbgLoc);
 +    uint32_t BranchWeight = 0;
 +    if (FuncInfo.BPI)
 +      BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
 +                                                 TrueMBB->getBasicBlock());
 +    FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight);
 +    return true;
 +  }
 +
 +  // Otherwise do a clumsy setcc and re-test it.
 +  // Note that i1 essentially gets ANY_EXTEND'ed to i8 where it isn't used
 +  // in an explicit cast, so make sure to handle that correctly.
 +  unsigned OpReg = getRegForValue(BI->getCondition());
 +  if (OpReg == 0) return false;
 +
 +  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
 +    .addReg(OpReg).addImm(1);
 +  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JNE_1))
 +    .addMBB(TrueMBB);
 +  fastEmitBranch(FalseMBB, DbgLoc);
 +  uint32_t BranchWeight = 0;
 +  if (FuncInfo.BPI)
 +    BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
 +                                               TrueMBB->getBasicBlock());
 +  FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight);
 +  return true;
 +}
 +
 +bool X86FastISel::X86SelectShift(const Instruction *I) {
 +  unsigned CReg = 0, OpReg = 0;
 +  const TargetRegisterClass *RC = nullptr;
 +  if (I->getType()->isIntegerTy(8)) {
 +    CReg = X86::CL;
 +    RC = &X86::GR8RegClass;
 +    switch (I->getOpcode()) {
 +    case Instruction::LShr: OpReg = X86::SHR8rCL; break;
 +    case Instruction::AShr: OpReg = X86::SAR8rCL; break;
 +    case Instruction::Shl:  OpReg = X86::SHL8rCL; break;
 +    default: return false;
 +    }
 +  } else if (I->getType()->isIntegerTy(16)) {
 +    CReg = X86::CX;
 +    RC = &X86::GR16RegClass;
 +    switch (I->getOpcode()) {
 +    case Instruction::LShr: OpReg = X86::SHR16rCL; break;
 +    case Instruction::AShr: OpReg = X86::SAR16rCL; break;
 +    case Instruction::Shl:  OpReg = X86::SHL16rCL; break;
 +    default: return false;
 +    }
 +  } else if (I->getType()->isIntegerTy(32)) {
 +    CReg = X86::ECX;
 +    RC = &X86::GR32RegClass;
 +    switch (I->getOpcode()) {
 +    case Instruction::LShr: OpReg = X86::SHR32rCL; break;
 +    case Instruction::AShr: OpReg = X86::SAR32rCL; break;
 +    case Instruction::Shl:  OpReg = X86::SHL32rCL; break;
 +    default: return false;
 +    }
 +  } else if (I->getType()->isIntegerTy(64)) {
 +    CReg = X86::RCX;
 +    RC = &X86::GR64RegClass;
 +    switch (I->getOpcode()) {
 +    case Instruction::LShr: OpReg = X86::SHR64rCL; break;
 +    case Instruction::AShr: OpReg = X86::SAR64rCL; break;
 +    case Instruction::Shl:  OpReg = X86::SHL64rCL; break;
 +    default: return false;
 +    }
 +  } else {
 +    return false;
 +  }
 +
 +  MVT VT;
 +  if (!isTypeLegal(I->getType(), VT))
 +    return false;
 +
 +  unsigned Op0Reg = getRegForValue(I->getOperand(0));
 +  if (Op0Reg == 0) return false;
 +
 +  unsigned Op1Reg = getRegForValue(I->getOperand(1));
 +  if (Op1Reg == 0) return false;
 +  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY),
 +          CReg).addReg(Op1Reg);
 +
 +  // The shift instruction uses X86::CL. If we defined a super-register
 +  // of X86::CL, emit a subreg KILL to precisely describe what we're doing here.
 +  if (CReg != X86::CL)
 +    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
 +            TII.get(TargetOpcode::KILL), X86::CL)
 +      .addReg(CReg, RegState::Kill);
 +
 +  unsigned ResultReg = createResultReg(RC);
 +  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(OpReg), ResultReg)
 +    .addReg(Op0Reg);
 +  updateValueMap(I, ResultReg);
 +  return true;
 +}
 +
 +bool X86FastISel::X86SelectDivRem(const Instruction *I) {
 +  const static unsigned NumTypes = 4; // i8, i16, i32, i64
 +  const static unsigned NumOps   = 4; // SDiv, SRem, UDiv, URem
 +  const static bool S = true;  // IsSigned
 +  const static bool U = false; // !IsSigned
 +  const static unsigned Copy = TargetOpcode::COPY;
 +  // For the X86 DIV/IDIV instruction, in most cases the dividend
 +  // (numerator) must be in a specific register pair highreg:lowreg,
 +  // producing the quotient in lowreg and the remainder in highreg.
 +  // For most data types, to set up the instruction, the dividend is
 +  // copied into lowreg, and lowreg is sign-extended or zero-extended
 +  // into highreg.  The exception is i8, where the dividend is defined
 +  // as a single register rather than a register pair, and we
 +  // therefore directly sign-extend or zero-extend the dividend into
 +  // lowreg, instead of copying, and ignore the highreg.
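+  // For example (illustrative), a 32-bit sdiv is emitted roughly as:
+  //   movl  %dividend, %eax
+  //   cltd                    # sign-extend EAX into EDX
+  //   idivl %divisor          # quotient -> EAX, remainder -> EDX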
 +  const static struct DivRemEntry {
 +    // The following portion depends only on the data type.
 +    const TargetRegisterClass *RC;
 +    unsigned LowInReg;  // low part of the register pair
 +    unsigned HighInReg; // high part of the register pair
 +    // The following portion depends on both the data type and the operation.
 +    struct DivRemResult {
 +    unsigned OpDivRem;        // The specific DIV/IDIV opcode to use.
 +    unsigned OpSignExtend;    // Opcode for sign-extending lowreg into
 +                              // highreg, or copying a zero into highreg.
 +    unsigned OpCopy;          // Opcode for copying dividend into lowreg, or
 +                              // zero/sign-extending into lowreg for i8.
 +    unsigned DivRemResultReg; // Register containing the desired result.
 +    bool IsOpSigned;          // Whether to use signed or unsigned form.
 +    } ResultTable[NumOps];
 +  } OpTable[NumTypes] = {
 +    { &X86::GR8RegClass,  X86::AX,  0, {
 +        { X86::IDIV8r,  0,            X86::MOVSX16rr8, X86::AL,  S }, // SDiv
 +        { X86::IDIV8r,  0,            X86::MOVSX16rr8, X86::AH,  S }, // SRem
 +        { X86::DIV8r,   0,            X86::MOVZX16rr8, X86::AL,  U }, // UDiv
 +        { X86::DIV8r,   0,            X86::MOVZX16rr8, X86::AH,  U }, // URem
 +      }
 +    }, // i8
 +    { &X86::GR16RegClass, X86::AX,  X86::DX, {
 +        { X86::IDIV16r, X86::CWD,     Copy,            X86::AX,  S }, // SDiv
 +        { X86::IDIV16r, X86::CWD,     Copy,            X86::DX,  S }, // SRem
 +        { X86::DIV16r,  X86::MOV32r0, Copy,            X86::AX,  U }, // UDiv
 +        { X86::DIV16r,  X86::MOV32r0, Copy,            X86::DX,  U }, // URem
 +      }
 +    }, // i16
 +    { &X86::GR32RegClass, X86::EAX, X86::EDX, {
 +        { X86::IDIV32r, X86::CDQ,     Copy,            X86::EAX, S }, // SDiv
 +        { X86::IDIV32r, X86::CDQ,     Copy,            X86::EDX, S }, // SRem
 +        { X86::DIV32r,  X86::MOV32r0, Copy,            X86::EAX, U }, // UDiv
 +        { X86::DIV32r,  X86::MOV32r0, Copy,            X86::EDX, U }, // URem
 +      }
 +    }, // i32
 +    { &X86::GR64RegClass, X86::RAX, X86::RDX, {
 +        { X86::IDIV64r, X86::CQO,     Copy,            X86::RAX, S }, // SDiv
 +        { X86::IDIV64r, X86::CQO,     Copy,            X86::RDX, S }, // SRem
 +        { X86::DIV64r,  X86::MOV32r0, Copy,            X86::RAX, U }, // UDiv
 +        { X86::DIV64r,  X86::MOV32r0, Copy,            X86::RDX, U }, // URem
 +      }
 +    }, // i64
 +  };
 +
 +  MVT VT;
 +  if (!isTypeLegal(I->getType(), VT))
 +    return false;
 +
 +  unsigned TypeIndex, OpIndex;
 +  switch (VT.SimpleTy) {
 +  default: return false;
 +  case MVT::i8:  TypeIndex = 0; break;
 +  case MVT::i16: TypeIndex = 1; break;
 +  case MVT::i32: TypeIndex = 2; break;
 +  case MVT::i64: TypeIndex = 3;
 +    if (!Subtarget->is64Bit())
 +      return false;
 +    break;
 +  }
 +
 +  switch (I->getOpcode()) {
 +  default: llvm_unreachable("Unexpected div/rem opcode");
 +  case Instruction::SDiv: OpIndex = 0; break;
 +  case Instruction::SRem: OpIndex = 1; break;
 +  case Instruction::UDiv: OpIndex = 2; break;
 +  case Instruction::URem: OpIndex = 3; break;
 +  }
 +
 +  const DivRemEntry &TypeEntry = OpTable[TypeIndex];
 +  const DivRemEntry::DivRemResult &OpEntry = TypeEntry.ResultTable[OpIndex];
 +  unsigned Op0Reg = getRegForValue(I->getOperand(0));
 +  if (Op0Reg == 0)
 +    return false;
 +  unsigned Op1Reg = getRegForValue(I->getOperand(1));
 +  if (Op1Reg == 0)
 +    return false;
 +
 +  // Move op0 into low-order input register.
 +  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
 +          TII.get(OpEntry.OpCopy), TypeEntry.LowInReg).addReg(Op0Reg);
 +  // Zero-extend or sign-extend into high-order input register.
 +  if (OpEntry.OpSignExtend) {
 +    if (OpEntry.IsOpSigned)
 +      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
 +              TII.get(OpEntry.OpSignExtend));
 +    else {
 +      unsigned Zero32 = createResultReg(&X86::GR32RegClass);
 +      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
 +              TII.get(X86::MOV32r0), Zero32);
 +
 +      // Copy the zero into the appropriate sub/super/identical physical
 +      // register. Unfortunately the operations needed are not uniform enough
 +      // to fit neatly into the table above.
 +      if (VT.SimpleTy == MVT::i16) {
 +        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
 +                TII.get(Copy), TypeEntry.HighInReg)
 +          .addReg(Zero32, 0, X86::sub_16bit);
 +      } else if (VT.SimpleTy == MVT::i32) {
 +        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
 +                TII.get(Copy), TypeEntry.HighInReg)
 +            .addReg(Zero32);
 +      } else if (VT.SimpleTy == MVT::i64) {
 +        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
 +                TII.get(TargetOpcode::SUBREG_TO_REG), TypeEntry.HighInReg)
 +            .addImm(0).addReg(Zero32).addImm(X86::sub_32bit);
 +      }
 +    }
 +  }
 +  // Generate the DIV/IDIV instruction.
 +  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
 +          TII.get(OpEntry.OpDivRem)).addReg(Op1Reg);
 +  // For i8 remainder, we can't reference AH directly, as we'll end
 +  // up with bogus copies like %R9B = COPY %AH. Reference AX
 +  // instead to prevent AH references in a REX instruction.
 +  //
 +  // The current assumption of the fast register allocator is that isel
 +  // won't generate explicit references to the GPR8_NOREX registers. If
 +  // the allocator and/or the backend get enhanced to be more robust in
 +  // that regard, this can be, and should be, removed.
 +  unsigned ResultReg = 0;
 +  if ((I->getOpcode() == Instruction::SRem ||
 +       I->getOpcode() == Instruction::URem) &&
 +      OpEntry.DivRemResultReg == X86::AH && Subtarget->is64Bit()) {
 +    unsigned SourceSuperReg = createResultReg(&X86::GR16RegClass);
 +    unsigned ResultSuperReg = createResultReg(&X86::GR16RegClass);
 +    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
 +            TII.get(Copy), SourceSuperReg).addReg(X86::AX);
 +
 +    // Shift AX right by 8 bits instead of using AH.
 +    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SHR16ri),
 +            ResultSuperReg).addReg(SourceSuperReg).addImm(8);
 +
 +    // Now reference the 8-bit subreg of the result.
 +    ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultSuperReg,
 +                                           /*Kill=*/true, X86::sub_8bit);
 +  }
 +  // Copy the result out of the physreg if we haven't already.
 +  if (!ResultReg) {
 +    ResultReg = createResultReg(TypeEntry.RC);
 +    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Copy), ResultReg)
 +        .addReg(OpEntry.DivRemResultReg);
 +  }
 +  updateValueMap(I, ResultReg);
 +
 +  return true;
 +}
 +
+/// \brief Emit a conditional move instruction (if they are supported) to lower
 +/// the select.
 +bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) {
 +  // Check if the subtarget supports these instructions.
 +  if (!Subtarget->hasCMov())
 +    return false;
 +
 +  // FIXME: Add support for i8.
 +  if (RetVT < MVT::i16 || RetVT > MVT::i64)
 +    return false;
 +
 +  const Value *Cond = I->getOperand(0);
 +  const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
 +  bool NeedTest = true;
 +  X86::CondCode CC = X86::COND_NE;
 +
 +  // Optimize conditions coming from a compare if both instructions are in the
 +  // same basic block (values defined in other basic blocks may not have
 +  // initialized registers).
 +  const auto *CI = dyn_cast<CmpInst>(Cond);
 +  if (CI && (CI->getParent() == I->getParent())) {
 +    CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
 +
 +    // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction.
 +    static unsigned SETFOpcTable[2][3] = {
 +      { X86::SETNPr, X86::SETEr , X86::TEST8rr },
 +      { X86::SETPr,  X86::SETNEr, X86::OR8rr   }
 +    };
 +    unsigned *SETFOpc = nullptr;
 +    switch (Predicate) {
 +    default: break;
 +    case CmpInst::FCMP_OEQ:
 +      SETFOpc = &SETFOpcTable[0][0];
 +      Predicate = CmpInst::ICMP_NE;
 +      break;
 +    case CmpInst::FCMP_UNE:
 +      SETFOpc = &SETFOpcTable[1][0];
 +      Predicate = CmpInst::ICMP_NE;
 +      break;
 +    }
 +
 +    bool NeedSwap;
 +    std::tie(CC, NeedSwap) = getX86ConditionCode(Predicate);
 +    assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
 +
 +    const Value *CmpLHS = CI->getOperand(0);
 +    const Value *CmpRHS = CI->getOperand(1);
 +    if (NeedSwap)
 +      std::swap(CmpLHS, CmpRHS);
 +
 +    EVT CmpVT = TLI.getValueType(CmpLHS->getType());
 +    // Emit a compare of the LHS and RHS, setting the flags.
 +    if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT, CI->getDebugLoc()))
 +      return false;
 +
 +    if (SETFOpc) {
 +      unsigned FlagReg1 = createResultReg(&X86::GR8RegClass);
 +      unsigned FlagReg2 = createResultReg(&X86::GR8RegClass);
 +      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[0]),
 +              FlagReg1);
 +      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[1]),
 +              FlagReg2);
 +      auto const &II = TII.get(SETFOpc[2]);
 +      if (II.getNumDefs()) {
 +        unsigned TmpReg = createResultReg(&X86::GR8RegClass);
 +        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, TmpReg)
 +          .addReg(FlagReg2).addReg(FlagReg1);
 +      } else {
 +        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
 +          .addReg(FlagReg2).addReg(FlagReg1);
 +      }
 +    }
 +    NeedTest = false;
 +  } else if (foldX86XALUIntrinsic(CC, I, Cond)) {
 +    // Fake request the condition, otherwise the intrinsic might be completely
 +    // optimized away.
 +    unsigned TmpReg = getRegForValue(Cond);
 +    if (TmpReg == 0)
 +      return false;
 +
 +    NeedTest = false;
 +  }
 +
 +  if (NeedTest) {
+    // Selects operate on i1, however, CondReg is 8 bits wide and may contain
+    // garbage. Indeed, only the least significant bit is supposed to be
+    // accurate. If we read more than the lsb, we may see a non-zero value
+    // even though the lsb is zero. Therefore, we have to truncate CondReg to
+    // i1 for the select. This is achieved by performing a TEST against 1.
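+    // That is, we emit roughly (illustrative):
+    //   testb $1, %cond      # ZF is set from bit 0 only
+    // so the CMOV below can key off the NE condition.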
 +    unsigned CondReg = getRegForValue(Cond);
 +    if (CondReg == 0)
 +      return false;
 +    bool CondIsKill = hasTrivialKill(Cond);
 +
 +    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
 +      .addReg(CondReg, getKillRegState(CondIsKill)).addImm(1);
 +  }
 +
 +  const Value *LHS = I->getOperand(1);
 +  const Value *RHS = I->getOperand(2);
 +
 +  unsigned RHSReg = getRegForValue(RHS);
 +  bool RHSIsKill = hasTrivialKill(RHS);
 +
 +  unsigned LHSReg = getRegForValue(LHS);
 +  bool LHSIsKill = hasTrivialKill(LHS);
 +
 +  if (!LHSReg || !RHSReg)
 +    return false;
 +
 +  unsigned Opc = X86::getCMovFromCond(CC, RC->getSize());
 +  unsigned ResultReg = fastEmitInst_rr(Opc, RC, RHSReg, RHSIsKill,
 +                                       LHSReg, LHSIsKill);
 +  updateValueMap(I, ResultReg);
 +  return true;
 +}
 +
 +/// \brief Emit SSE instructions to lower the select.
 +///
 +/// Try to use SSE1/SSE2 instructions to simulate a select without branches.
 +/// This lowers fp selects into a CMP/AND/ANDN/OR sequence when the necessary
 +/// SSE instructions are available.
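+///
+/// For example (illustrative), an f32 select on "fcmp olt" becomes roughly:
+///   CMPSS (lt)   -> an all-ones/all-zeros mask
+///   ANDPS mask with the true value, ANDNPS mask with the false value
+///   ORPS the two masked halves together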
 +bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) {
 +  // Optimize conditions coming from a compare if both instructions are in the
 +  // same basic block (values defined in other basic blocks may not have
 +  // initialized registers).
 +  const auto *CI = dyn_cast<FCmpInst>(I->getOperand(0));
 +  if (!CI || (CI->getParent() != I->getParent()))
 +    return false;
 +
 +  if (I->getType() != CI->getOperand(0)->getType() ||
 +      !((Subtarget->hasSSE1() && RetVT == MVT::f32) ||
 +        (Subtarget->hasSSE2() && RetVT == MVT::f64)))
 +    return false;
 +
 +  const Value *CmpLHS = CI->getOperand(0);
 +  const Value *CmpRHS = CI->getOperand(1);
 +  CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
 +
 +  // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, 0.0.
 +  // We don't have to materialize a zero constant for this case and can just use
 +  // %x again on the RHS.
 +  if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) {
 +    const auto *CmpRHSC = dyn_cast<ConstantFP>(CmpRHS);
 +    if (CmpRHSC && CmpRHSC->isNullValue())
 +      CmpRHS = CmpLHS;
 +  }
 +
 +  unsigned CC;
 +  bool NeedSwap;
 +  std::tie(CC, NeedSwap) = getX86SSEConditionCode(Predicate);
 +  if (CC > 7)
 +    return false;
 +
 +  if (NeedSwap)
 +    std::swap(CmpLHS, CmpRHS);
 +
 +  static unsigned OpcTable[2][2][4] = {
 +    { { X86::CMPSSrr,  X86::FsANDPSrr,  X86::FsANDNPSrr,  X86::FsORPSrr  },
 +      { X86::VCMPSSrr, X86::VFsANDPSrr, X86::VFsANDNPSrr, X86::VFsORPSrr }  },
 +    { { X86::CMPSDrr,  X86::FsANDPDrr,  X86::FsANDNPDrr,  X86::FsORPDrr  },
 +      { X86::VCMPSDrr, X86::VFsANDPDrr, X86::VFsANDNPDrr, X86::VFsORPDrr }  }
 +  };
 +
 +  bool HasAVX = Subtarget->hasAVX();
 +  unsigned *Opc = nullptr;
 +  switch (RetVT.SimpleTy) {
 +  default: return false;
 +  case MVT::f32: Opc = &OpcTable[0][HasAVX][0]; break;
 +  case MVT::f64: Opc = &OpcTable[1][HasAVX][0]; break;
 +  }
 +
 +  const Value *LHS = I->getOperand(1);
 +  const Value *RHS = I->getOperand(2);
 +
 +  unsigned LHSReg = getRegForValue(LHS);
 +  bool LHSIsKill = hasTrivialKill(LHS);
 +
 +  unsigned RHSReg = getRegForValue(RHS);
 +  bool RHSIsKill = hasTrivialKill(RHS);
 +
 +  unsigned CmpLHSReg = getRegForValue(CmpLHS);
 +  bool CmpLHSIsKill = hasTrivialKill(CmpLHS);
 +
 +  unsigned CmpRHSReg = getRegForValue(CmpRHS);
 +  bool CmpRHSIsKill = hasTrivialKill(CmpRHS);
 +
+  if (!LHSReg || !RHSReg || !CmpLHSReg || !CmpRHSReg)
 +    return false;
 +
 +  const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
 +  unsigned CmpReg = fastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill,
 +                                     CmpRHSReg, CmpRHSIsKill, CC);
 +  unsigned AndReg = fastEmitInst_rr(Opc[1], RC, CmpReg, /*IsKill=*/false,
 +                                    LHSReg, LHSIsKill);
 +  unsigned AndNReg = fastEmitInst_rr(Opc[2], RC, CmpReg, /*IsKill=*/true,
 +                                     RHSReg, RHSIsKill);
 +  unsigned ResultReg = fastEmitInst_rr(Opc[3], RC, AndNReg, /*IsKill=*/true,
 +                                       AndReg, /*IsKill=*/true);
 +  updateValueMap(I, ResultReg);
 +  return true;
 +}
 +
 +bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) {
 +  // These are pseudo CMOV instructions and will be later expanded into control-
 +  // flow.
 +  unsigned Opc;
 +  switch (RetVT.SimpleTy) {
 +  default: return false;
 +  case MVT::i8:  Opc = X86::CMOV_GR8;  break;
 +  case MVT::i16: Opc = X86::CMOV_GR16; break;
 +  case MVT::i32: Opc = X86::CMOV_GR32; break;
 +  case MVT::f32: Opc = X86::CMOV_FR32; break;
 +  case MVT::f64: Opc = X86::CMOV_FR64; break;
 +  }
 +
 +  const Value *Cond = I->getOperand(0);
 +  X86::CondCode CC = X86::COND_NE;
 +
 +  // Optimize conditions coming from a compare if both instructions are in the
 +  // same basic block (values defined in other basic blocks may not have
 +  // initialized registers).
 +  const auto *CI = dyn_cast<CmpInst>(Cond);
 +  if (CI && (CI->getParent() == I->getParent())) {
 +    bool NeedSwap;
 +    std::tie(CC, NeedSwap) = getX86ConditionCode(CI->getPredicate());
 +    if (CC > X86::LAST_VALID_COND)
 +      return false;
 +
 +    const Value *CmpLHS = CI->getOperand(0);
 +    const Value *CmpRHS = CI->getOperand(1);
 +
 +    if (NeedSwap)
 +      std::swap(CmpLHS, CmpRHS);
 +
 +    EVT CmpVT = TLI.getValueType(CmpLHS->getType());
 +    if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT, CI->getDebugLoc()))
 +      return false;
 +  } else {
 +    unsigned CondReg = getRegForValue(Cond);
 +    if (CondReg == 0)
 +      return false;
 +    bool CondIsKill = hasTrivialKill(Cond);
 +    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
 +      .addReg(CondReg, getKillRegState(CondIsKill)).addImm(1);
 +  }
 +
 +  const Value *LHS = I->getOperand(1);
 +  const Value *RHS = I->getOperand(2);
 +
 +  unsigned LHSReg = getRegForValue(LHS);
 +  bool LHSIsKill = hasTrivialKill(LHS);
 +
 +  unsigned RHSReg = getRegForValue(RHS);
 +  bool RHSIsKill = hasTrivialKill(RHS);
 +
 +  if (!LHSReg || !RHSReg)
 +    return false;
 +
 +  const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
 +
 +  unsigned ResultReg =
 +    fastEmitInst_rri(Opc, RC, RHSReg, RHSIsKill, LHSReg, LHSIsKill, CC);
 +  updateValueMap(I, ResultReg);
 +  return true;
 +}
 +
 +bool X86FastISel::X86SelectSelect(const Instruction *I) {
 +  MVT RetVT;
 +  if (!isTypeLegal(I->getType(), RetVT))
 +    return false;
 +
 +  // Check if we can fold the select.
 +  if (const auto *CI = dyn_cast<CmpInst>(I->getOperand(0))) {
 +    CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
 +    const Value *Opnd = nullptr;
 +    switch (Predicate) {
 +    default:                              break;
 +    case CmpInst::FCMP_FALSE: Opnd = I->getOperand(2); break;
 +    case CmpInst::FCMP_TRUE:  Opnd = I->getOperand(1); break;
 +    }
 +    // No need for a select anymore - this is an unconditional move.
 +    if (Opnd) {
 +      unsigned OpReg = getRegForValue(Opnd);
 +      if (OpReg == 0)
 +        return false;
 +      bool OpIsKill = hasTrivialKill(Opnd);
 +      const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
 +      unsigned ResultReg = createResultReg(RC);
 +      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
 +              TII.get(TargetOpcode::COPY), ResultReg)
 +        .addReg(OpReg, getKillRegState(OpIsKill));
 +      updateValueMap(I, ResultReg);
 +      return true;
 +    }
 +  }
 +
 +  // First try to use real conditional move instructions.
 +  if (X86FastEmitCMoveSelect(RetVT, I))
 +    return true;
 +
 +  // Try to use a sequence of SSE instructions to simulate a conditional move.
 +  if (X86FastEmitSSESelect(RetVT, I))
 +    return true;
 +
 +  // Fall-back to pseudo conditional move instructions, which will be later
 +  // converted to control-flow.
 +  if (X86FastEmitPseudoSelect(RetVT, I))
 +    return true;
 +
 +  return false;
 +}
 +
 +bool X86FastISel::X86SelectFPExt(const Instruction *I) {
 +  // fpext from float to double.
 +  if (X86ScalarSSEf64 &&
 +      I->getType()->isDoubleTy()) {
 +    const Value *V = I->getOperand(0);
 +    if (V->getType()->isFloatTy()) {
 +      unsigned OpReg = getRegForValue(V);
 +      if (OpReg == 0) return false;
 +      unsigned ResultReg = createResultReg(&X86::FR64RegClass);
 +      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
 +              TII.get(X86::CVTSS2SDrr), ResultReg)
 +        .addReg(OpReg);
 +      updateValueMap(I, ResultReg);
 +      return true;
 +    }
 +  }
 +
 +  return false;
 +}
 +
 +bool X86FastISel::X86SelectFPTrunc(const Instruction *I) {
 +  if (X86ScalarSSEf64) {
 +    if (I->getType()->isFloatTy()) {
 +      const Value *V = I->getOperand(0);
 +      if (V->getType()->isDoubleTy()) {
 +        unsigned OpReg = getRegForValue(V);
 +        if (OpReg == 0) return false;
 +        unsigned ResultReg = createResultReg(&X86::FR32RegClass);
 +        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
 +                TII.get(X86::CVTSD2SSrr), ResultReg)
 +          .addReg(OpReg);
 +        updateValueMap(I, ResultReg);
 +        return true;
 +      }
 +    }
 +  }
 +
 +  return false;
 +}
 +
 +bool X86FastISel::X86SelectTrunc(const Instruction *I) {
 +  EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType());
 +  EVT DstVT = TLI.getValueType(I->getType());
 +
 +  // This code only handles truncation to byte.
 +  if (DstVT != MVT::i8 && DstVT != MVT::i1)
 +    return false;
 +  if (!TLI.isTypeLegal(SrcVT))
 +    return false;
 +
 +  unsigned InputReg = getRegForValue(I->getOperand(0));
 +  if (!InputReg)
 +    // Unhandled operand.  Halt "fast" selection and bail.
 +    return false;
 +
 +  if (SrcVT == MVT::i8) {
 +    // Truncate from i8 to i1; no code needed.
 +    updateValueMap(I, InputReg);
 +    return true;
 +  }
 +
 +  if (!Subtarget->is64Bit()) {
+    // If we're on x86-32, we can't extract an i8 from a general register.
 +    // First issue a copy to GR16_ABCD or GR32_ABCD.
 +    const TargetRegisterClass *CopyRC =
 +      (SrcVT == MVT::i16) ? &X86::GR16_ABCDRegClass : &X86::GR32_ABCDRegClass;
 +    unsigned CopyReg = createResultReg(CopyRC);
 +    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
 +            TII.get(TargetOpcode::COPY), CopyReg).addReg(InputReg);
 +    InputReg = CopyReg;
 +  }
 +
 +  // Issue an extract_subreg.
 +  unsigned ResultReg = fastEmitInst_extractsubreg(MVT::i8,
 +                                                  InputReg, /*Kill=*/true,
 +                                                  X86::sub_8bit);
 +  if (!ResultReg)
 +    return false;
 +
 +  updateValueMap(I, ResultReg);
 +  return true;
 +}
 +
 +bool X86FastISel::IsMemcpySmall(uint64_t Len) {
 +  return Len <= (Subtarget->is64Bit() ? 32 : 16);
 +}
 +
 +bool X86FastISel::TryEmitSmallMemcpy(X86AddressMode DestAM,
 +                                     X86AddressMode SrcAM, uint64_t Len) {
 +
 +  // Make sure we don't bloat code by inlining very large memcpy's.
 +  if (!IsMemcpySmall(Len))
 +    return false;
 +
 +  bool i64Legal = Subtarget->is64Bit();
 +
 +  // We don't care about alignment here since we just emit integer accesses.
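+  // For example (illustrative), a 7-byte copy on x86-64 is unrolled into an
+  // i32, then an i16, then an i8 load/store pair.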
 +  while (Len) {
 +    MVT VT;
 +    if (Len >= 8 && i64Legal)
 +      VT = MVT::i64;
 +    else if (Len >= 4)
 +      VT = MVT::i32;
 +    else if (Len >= 2)
 +      VT = MVT::i16;
 +    else
 +      VT = MVT::i8;
 +
 +    unsigned Reg;
 +    bool RV = X86FastEmitLoad(VT, SrcAM, nullptr, Reg);
 +    RV &= X86FastEmitStore(VT, Reg, /*Kill=*/true, DestAM);
 +    assert(RV && "Failed to emit load or store??");
 +
 +    unsigned Size = VT.getSizeInBits()/8;
 +    Len -= Size;
 +    DestAM.Disp += Size;
 +    SrcAM.Disp += Size;
 +  }
 +
 +  return true;
 +}
 +
 +bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
 +  // FIXME: Handle more intrinsics.
 +  switch (II->getIntrinsicID()) {
 +  default: return false;
 +  case Intrinsic::frameaddress: {
 +    Type *RetTy = II->getCalledFunction()->getReturnType();
 +
 +    MVT VT;
 +    if (!isTypeLegal(RetTy, VT))
 +      return false;
 +
 +    unsigned Opc;
 +    const TargetRegisterClass *RC = nullptr;
 +
 +    switch (VT.SimpleTy) {
 +    default: llvm_unreachable("Invalid result type for frameaddress.");
 +    case MVT::i32: Opc = X86::MOV32rm; RC = &X86::GR32RegClass; break;
 +    case MVT::i64: Opc = X86::MOV64rm; RC = &X86::GR64RegClass; break;
 +    }
 +
 +    // This needs to be set before we call getPtrSizedFrameRegister, otherwise
 +    // we get the wrong frame register.
 +    MachineFrameInfo *MFI = FuncInfo.MF->getFrameInfo();
 +    MFI->setFrameAddressIsTaken(true);
 +
 +    const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
 +        TM.getSubtargetImpl()->getRegisterInfo());
 +    unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(*(FuncInfo.MF));
 +    assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
 +            (FrameReg == X86::EBP && VT == MVT::i32)) &&
 +           "Invalid Frame Register!");
 +
+    // Always make a copy of the frame register to a vreg first, so that we
 +    // never directly reference the frame register (the TwoAddressInstruction-
 +    // Pass doesn't like that).
 +    unsigned SrcReg = createResultReg(RC);
 +    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
 +            TII.get(TargetOpcode::COPY), SrcReg).addReg(FrameReg);
 +
 +    // Now recursively load from the frame address.
 +    // movq (%rbp), %rax
 +    // movq (%rax), %rax
 +    // movq (%rax), %rax
 +    // ...
 +    unsigned DestReg;
 +    unsigned Depth = cast<ConstantInt>(II->getOperand(0))->getZExtValue();
 +    while (Depth--) {
 +      DestReg = createResultReg(RC);
 +      addDirectMem(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
 +                           TII.get(Opc), DestReg), SrcReg);
 +      SrcReg = DestReg;
 +    }
 +
 +    updateValueMap(II, SrcReg);
 +    return true;
 +  }
 +  case Intrinsic::memcpy: {
 +    const MemCpyInst *MCI = cast<MemCpyInst>(II);
 +    // Don't handle volatile or variable length memcpys.
 +    if (MCI->isVolatile())
 +      return false;
 +
 +    if (isa<ConstantInt>(MCI->getLength())) {
 +      // Small memcpy's are common enough that we want to do them
 +      // without a call if possible.
 +      uint64_t Len = cast<ConstantInt>(MCI->getLength())->getZExtValue();
 +      if (IsMemcpySmall(Len)) {
 +        X86AddressMode DestAM, SrcAM;
 +        if (!X86SelectAddress(MCI->getRawDest(), DestAM) ||
 +            !X86SelectAddress(MCI->getRawSource(), SrcAM))
 +          return false;
 +        TryEmitSmallMemcpy(DestAM, SrcAM, Len);
 +        return true;
 +      }
 +    }
 +
 +    unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32;
 +    if (!MCI->getLength()->getType()->isIntegerTy(SizeWidth))
 +      return false;
 +
 +    if (MCI->getSourceAddressSpace() > 255 || MCI->getDestAddressSpace() > 255)
 +      return false;
 +
 +    return lowerCallTo(II, "memcpy", II->getNumArgOperands() - 2);
 +  }
 +  case Intrinsic::memset: {
 +    const MemSetInst *MSI = cast<MemSetInst>(II);
 +
 +    if (MSI->isVolatile())
 +      return false;
 +
 +    unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32;
 +    if (!MSI->getLength()->getType()->isIntegerTy(SizeWidth))
 +      return false;
 +
 +    if (MSI->getDestAddressSpace() > 255)
 +      return false;
 +
 +    return lowerCallTo(II, "memset", II->getNumArgOperands() - 2);
 +  }
 +  case Intrinsic::stackprotector: {
 +    // Emit code to store the stack guard onto the stack.
 +    EVT PtrTy = TLI.getPointerTy();
 +
 +    const Value *Op1 = II->getArgOperand(0); // The guard's value.
 +    const AllocaInst *Slot = cast<AllocaInst>(II->getArgOperand(1));
 +
 +    MFI.setStackProtectorIndex(FuncInfo.StaticAllocaMap[Slot]);
 +
 +    // Grab the frame index.
 +    X86AddressMode AM;
 +    if (!X86SelectAddress(Slot, AM)) return false;
 +    if (!X86FastEmitStore(PtrTy, Op1, AM)) return false;
 +    return true;
 +  }
 +  case Intrinsic::dbg_declare: {
 +    const DbgDeclareInst *DI = cast<DbgDeclareInst>(II);
 +    X86AddressMode AM;
 +    assert(DI->getAddress() && "Null address should be checked earlier!");
 +    if (!X86SelectAddress(DI->getAddress(), AM))
 +      return false;
 +    const MCInstrDesc &II = TII.get(TargetOpcode::DBG_VALUE);
 +    // FIXME may need to add RegState::Debug to any registers produced,
 +    // although ESP/EBP should be the only ones at the moment.
 +    addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II), AM)
 +        .addImm(0)
 +        .addMetadata(DI->getVariable())
 +        .addMetadata(DI->getExpression());
 +    return true;
 +  }
 +  case Intrinsic::trap: {
 +    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TRAP));
 +    return true;
 +  }
 +  case Intrinsic::sqrt: {
 +    if (!Subtarget->hasSSE1())
 +      return false;
 +
 +    Type *RetTy = II->getCalledFunction()->getReturnType();
 +
 +    MVT VT;
 +    if (!isTypeLegal(RetTy, VT))
 +      return false;
 +
 +    // Unfortunately we can't use fastEmit_r, because the AVX version of FSQRT
 +    // is not generated by FastISel yet.
 +    // FIXME: Update this code once tablegen can handle it.
 +    static const unsigned SqrtOpc[2][2] = {
 +      {X86::SQRTSSr, X86::VSQRTSSr},
 +      {X86::SQRTSDr, X86::VSQRTSDr}
 +    };
 +    bool HasAVX = Subtarget->hasAVX();
 +    unsigned Opc;
 +    const TargetRegisterClass *RC;
 +    switch (VT.SimpleTy) {
 +    default: return false;
 +    case MVT::f32: Opc = SqrtOpc[0][HasAVX]; RC = &X86::FR32RegClass; break;
 +    case MVT::f64: Opc = SqrtOpc[1][HasAVX]; RC = &X86::FR64RegClass; break;
 +    }
 +
 +    const Value *SrcVal = II->getArgOperand(0);
 +    unsigned SrcReg = getRegForValue(SrcVal);
 +
 +    if (SrcReg == 0)
 +      return false;
 +
 +    unsigned ImplicitDefReg = 0;
 +    if (HasAVX) {
 +      ImplicitDefReg = createResultReg(RC);
 +      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
 +              TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);
 +    }
 +
 +    unsigned ResultReg = createResultReg(RC);
 +    MachineInstrBuilder MIB;
 +    MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc),
 +                  ResultReg);
 +
 +    if (ImplicitDefReg)
 +      MIB.addReg(ImplicitDefReg);
 +
 +    MIB.addReg(SrcReg);
 +
 +    updateValueMap(II, ResultReg);
 +    return true;
 +  }
 +  case Intrinsic::sadd_with_overflow:
 +  case Intrinsic::uadd_with_overflow:
 +  case Intrinsic::ssub_with_overflow:
 +  case Intrinsic::usub_with_overflow:
 +  case Intrinsic::smul_with_overflow:
 +  case Intrinsic::umul_with_overflow: {
 +    // This implements the basic lowering of the xalu with overflow intrinsics
 +    // into add/sub/mul followed by either seto or setb.
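+    // For example (illustrative), i32 uadd.with.overflow becomes roughly:
+    //   addl %rhs, %lhs
+    //   setb %overflow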
 +    const Function *Callee = II->getCalledFunction();
 +    auto *Ty = cast<StructType>(Callee->getReturnType());
 +    Type *RetTy = Ty->getTypeAtIndex(0U);
 +    Type *CondTy = Ty->getTypeAtIndex(1);
 +
 +    MVT VT;
 +    if (!isTypeLegal(RetTy, VT))
 +      return false;
 +
 +    if (VT < MVT::i8 || VT > MVT::i64)
 +      return false;
 +
 +    const Value *LHS = II->getArgOperand(0);
 +    const Value *RHS = II->getArgOperand(1);
 +
 +    // Canonicalize immediate to the RHS.
 +    if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS) &&
 +        isCommutativeIntrinsic(II))
 +      std::swap(LHS, RHS);
 +
 +    bool UseIncDec = false;
 +    if (isa<ConstantInt>(RHS) && cast<ConstantInt>(RHS)->isOne())
 +      UseIncDec = true;
 +
 +    unsigned BaseOpc, CondOpc;
 +    switch (II->getIntrinsicID()) {
 +    default: llvm_unreachable("Unexpected intrinsic!");
 +    case Intrinsic::sadd_with_overflow:
 +      BaseOpc = UseIncDec ? unsigned(X86ISD::INC) : unsigned(ISD::ADD);
 +      CondOpc = X86::SETOr;
 +      break;
 +    case Intrinsic::uadd_with_overflow:
 +      BaseOpc = ISD::ADD; CondOpc = X86::SETBr; break;
 +    case Intrinsic::ssub_with_overflow:
 +      BaseOpc = UseIncDec ? unsigned(X86ISD::DEC) : unsigned(ISD::SUB);
 +      CondOpc = X86::SETOr;
 +      break;
 +    case Intrinsic::usub_with_overflow:
 +      BaseOpc = ISD::SUB; CondOpc = X86::SETBr; break;
 +    case Intrinsic::smul_with_overflow:
 +      BaseOpc = X86ISD::SMUL; CondOpc = X86::SETOr; break;
 +    case Intrinsic::umul_with_overflow:
 +      BaseOpc = X86ISD::UMUL; CondOpc = X86::SETOr; break;
 +    }
 +
 +    unsigned LHSReg = getRegForValue(LHS);
 +    if (LHSReg == 0)
 +      return false;
 +    bool LHSIsKill = hasTrivialKill(LHS);
 +
 +    unsigned ResultReg = 0;
 +    // Check if we have an immediate version.
 +    if (const auto *CI = dyn_cast<ConstantInt>(RHS)) {
 +      static const unsigned Opc[2][4] = {
 +        { X86::INC8r, X86::INC16r, X86::INC32r, X86::INC64r },
 +        { X86::DEC8r, X86::DEC16r, X86::DEC32r, X86::DEC64r }
 +      };
 +
 +      if (BaseOpc == X86ISD::INC || BaseOpc == X86ISD::DEC) {
 +        ResultReg = createResultReg(TLI.getRegClassFor(VT));
 +        bool IsDec = BaseOpc == X86ISD::DEC;
 +        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
 +                TII.get(Opc[IsDec][VT.SimpleTy-MVT::i8]), ResultReg)
 +          .addReg(LHSReg, getKillRegState(LHSIsKill));
 +      } else
 +        ResultReg = fastEmit_ri(VT, VT, BaseOpc, LHSReg, LHSIsKill,
 +                                CI->getZExtValue());
 +    }
 +
 +    unsigned RHSReg;
 +    bool RHSIsKill;
 +    if (!ResultReg) {
 +      RHSReg = getRegForValue(RHS);
 +      if (RHSReg == 0)
 +        return false;
 +      RHSIsKill = hasTrivialKill(RHS);
 +      ResultReg = fastEmit_rr(VT, VT, BaseOpc, LHSReg, LHSIsKill, RHSReg,
 +                              RHSIsKill);
 +    }
 +
 +    // FastISel doesn't have a pattern for all X86::MUL*r and X86::IMUL*r. Emit
 +    // it manually.
 +    if (BaseOpc == X86ISD::UMUL && !ResultReg) {
 +      static const unsigned MULOpc[] =
 +        { X86::MUL8r, X86::MUL16r, X86::MUL32r, X86::MUL64r };
 +      static const unsigned Reg[] = { X86::AL, X86::AX, X86::EAX, X86::RAX };
+      // Copy the first operand into AL/AX/EAX/RAX (depending on the width),
+      // which is an implicit input to the X86::MUL*r instruction.
 +      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
 +              TII.get(TargetOpcode::COPY), Reg[VT.SimpleTy-MVT::i8])
 +        .addReg(LHSReg, getKillRegState(LHSIsKill));
 +      ResultReg = fastEmitInst_r(MULOpc[VT.SimpleTy-MVT::i8],
 +                                 TLI.getRegClassFor(VT), RHSReg, RHSIsKill);
 +    } else if (BaseOpc == X86ISD::SMUL && !ResultReg) {
 +      static const unsigned MULOpc[] =
 +        { X86::IMUL8r, X86::IMUL16rr, X86::IMUL32rr, X86::IMUL64rr };
 +      if (VT == MVT::i8) {
 +        // Copy the first operand into AL, which is an implicit input to the
 +        // X86::IMUL8r instruction.
 +        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
 +               TII.get(TargetOpcode::COPY), X86::AL)
 +          .addReg(LHSReg, getKillRegState(LHSIsKill));
 +        ResultReg = fastEmitInst_r(MULOpc[0], TLI.getRegClassFor(VT), RHSReg,
 +                                   RHSIsKill);
 +      } else
 +        ResultReg = fastEmitInst_rr(MULOpc[VT.SimpleTy-MVT::i8],
 +                                    TLI.getRegClassFor(VT), LHSReg, LHSIsKill,
 +                                    RHSReg, RHSIsKill);
 +    }
 +
 +    if (!ResultReg)
 +      return false;
 +
 +    unsigned ResultReg2 = FuncInfo.CreateRegs(CondTy);
 +    assert((ResultReg+1) == ResultReg2 && "Nonconsecutive result registers.");
 +    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CondOpc),
 +            ResultReg2);
 +
 +    updateValueMap(II, ResultReg, 2);
 +    return true;
 +  }
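// Illustrative example (editorial sketch, not part of this patch):
//   %t = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
// becomes an ADD (or INC when the RHS is the constant 1) followed by SETOr
// into the second of two consecutive result registers; the unsigned add/sub
// variants test the carry flag via SETBr instead of the overflow flag.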
 +  case Intrinsic::x86_sse_cvttss2si:
 +  case Intrinsic::x86_sse_cvttss2si64:
 +  case Intrinsic::x86_sse2_cvttsd2si:
 +  case Intrinsic::x86_sse2_cvttsd2si64: {
 +    bool IsInputDouble;
 +    switch (II->getIntrinsicID()) {
 +    default: llvm_unreachable("Unexpected intrinsic.");
 +    case Intrinsic::x86_sse_cvttss2si:
 +    case Intrinsic::x86_sse_cvttss2si64:
 +      if (!Subtarget->hasSSE1())
 +        return false;
 +      IsInputDouble = false;
 +      break;
 +    case Intrinsic::x86_sse2_cvttsd2si:
 +    case Intrinsic::x86_sse2_cvttsd2si64:
 +      if (!Subtarget->hasSSE2())
 +        return false;
 +      IsInputDouble = true;
 +      break;
 +    }
 +
 +    Type *RetTy = II->getCalledFunction()->getReturnType();
 +    MVT VT;
 +    if (!isTypeLegal(RetTy, VT))
 +      return false;
 +
 +    static const unsigned CvtOpc[2][2][2] = {
 +      { { X86::CVTTSS2SIrr,   X86::VCVTTSS2SIrr   },
 +        { X86::CVTTSS2SI64rr, X86::VCVTTSS2SI64rr }  },
 +      { { X86::CVTTSD2SIrr,   X86::VCVTTSD2SIrr   },
 +        { X86::CVTTSD2SI64rr, X86::VCVTTSD2SI64rr }  }
 +    };
 +    bool HasAVX = Subtarget->hasAVX();
 +    unsigned Opc;
 +    switch (VT.SimpleTy) {
 +    default: llvm_unreachable("Unexpected result type.");
 +    case MVT::i32: Opc = CvtOpc[IsInputDouble][0][HasAVX]; break;
 +    case MVT::i64: Opc = CvtOpc[IsInputDouble][1][HasAVX]; break;
 +    }
 +
 +    // Check if we can fold insertelement instructions into the convert.
 +    const Value *Op = II->getArgOperand(0);
 +    while (auto *IE = dyn_cast<InsertElementInst>(Op)) {
 +      const Value *Index = IE->getOperand(2);
 +      if (!isa<ConstantInt>(Index))
 +        break;
 +      unsigned Idx = cast<ConstantInt>(Index)->getZExtValue();
 +
 +      if (Idx == 0) {
 +        Op = IE->getOperand(1);
 +        break;
 +      }
 +      Op = IE->getOperand(0);
 +    }
 +
 +    unsigned Reg = getRegForValue(Op);
 +    if (Reg == 0)
 +      return false;
 +
 +    unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
 +    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
 +      .addReg(Reg);
 +
 +    updateValueMap(II, ResultReg);
 +    return true;
 +  }
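// Illustrative example (editorial sketch, not part of this patch) of the
// insertelement folding above:
//   %v = insertelement <4 x float> undef, float %x, i32 0
//   %i = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %v)
// Only lane 0 is read by the conversion, so the loop peels the insert and
// converts %x directly with CVTTSS2SIrr (VCVTTSS2SIrr with AVX).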
 +  }
 +}
 +
 +bool X86FastISel::fastLowerArguments() {
 +  if (!FuncInfo.CanLowerReturn)
 +    return false;
 +
 +  const Function *F = FuncInfo.Fn;
 +  if (F->isVarArg())
 +    return false;
 +
 +  CallingConv::ID CC = F->getCallingConv();
 +  if (CC != CallingConv::C)
 +    return false;
 +
 +  if (Subtarget->isCallingConvWin64(CC))
 +    return false;
 +
 +  if (!Subtarget->is64Bit())
 +    return false;
 +
+  // Only handle simple cases, i.e. up to 6 i32/i64 and 8 f32/f64 scalar
+  // arguments.
 +  unsigned GPRCnt = 0;
 +  unsigned FPRCnt = 0;
 +  unsigned Idx = 0;
 +  for (auto const &Arg : F->args()) {
 +    // The first argument is at index 1.
 +    ++Idx;
 +    if (F->getAttributes().hasAttribute(Idx, Attribute::ByVal) ||
 +        F->getAttributes().hasAttribute(Idx, Attribute::InReg) ||
 +        F->getAttributes().hasAttribute(Idx, Attribute::StructRet) ||
 +        F->getAttributes().hasAttribute(Idx, Attribute::Nest))
 +      return false;
 +
 +    Type *ArgTy = Arg.getType();
 +    if (ArgTy->isStructTy() || ArgTy->isArrayTy() || ArgTy->isVectorTy())
 +      return false;
 +
 +    EVT ArgVT = TLI.getValueType(ArgTy);
 +    if (!ArgVT.isSimple()) return false;
 +    switch (ArgVT.getSimpleVT().SimpleTy) {
 +    default: return false;
 +    case MVT::i32:
 +    case MVT::i64:
 +      ++GPRCnt;
 +      break;
 +    case MVT::f32:
 +    case MVT::f64:
 +      if (!Subtarget->hasSSE1())
 +        return false;
 +      ++FPRCnt;
 +      break;
 +    }
 +
 +    if (GPRCnt > 6)
 +      return false;
 +
 +    if (FPRCnt > 8)
 +      return false;
 +  }
 +
 +  static const MCPhysReg GPR32ArgRegs[] = {
 +    X86::EDI, X86::ESI, X86::EDX, X86::ECX, X86::R8D, X86::R9D
 +  };
 +  static const MCPhysReg GPR64ArgRegs[] = {
 +    X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8 , X86::R9
 +  };
 +  static const MCPhysReg XMMArgRegs[] = {
 +    X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
 +    X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
 +  };
 +
 +  unsigned GPRIdx = 0;
 +  unsigned FPRIdx = 0;
 +  for (auto const &Arg : F->args()) {
 +    MVT VT = TLI.getSimpleValueType(Arg.getType());
 +    const TargetRegisterClass *RC = TLI.getRegClassFor(VT);
 +    unsigned SrcReg;
 +    switch (VT.SimpleTy) {
 +    default: llvm_unreachable("Unexpected value type.");
 +    case MVT::i32: SrcReg = GPR32ArgRegs[GPRIdx++]; break;
 +    case MVT::i64: SrcReg = GPR64ArgRegs[GPRIdx++]; break;
 +    case MVT::f32: // fall-through
 +    case MVT::f64: SrcReg = XMMArgRegs[FPRIdx++]; break;
 +    }
 +    unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC);
 +    // FIXME: Unfortunately it's necessary to emit a copy from the livein copy.
 +    // Without this, EmitLiveInCopies may eliminate the livein if its only
 +    // use is a bitcast (which isn't turned into an instruction).
 +    unsigned ResultReg = createResultReg(RC);
 +    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
 +            TII.get(TargetOpcode::COPY), ResultReg)
 +      .addReg(DstReg, getKillRegState(true));
 +    updateValueMap(&Arg, ResultReg);
 +  }
 +  return true;
 +}
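// Illustrative example (editorial sketch, not part of this patch): a SysV
// x86-64 function such as
//   long f(long a, long b, double x);
// takes this fast path: a and b arrive in RDI and RSI, x in XMM0, and each
// live-in is copied into a fresh virtual register before being recorded in
// the value map.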
 +
 +static unsigned computeBytesPoppedByCallee(const X86Subtarget *Subtarget,
 +                                           CallingConv::ID CC,
 +                                           ImmutableCallSite *CS) {
 +  if (Subtarget->is64Bit())
 +    return 0;
 +  if (Subtarget->getTargetTriple().isOSMSVCRT())
 +    return 0;
 +  if (CC == CallingConv::Fast || CC == CallingConv::GHC ||
 +      CC == CallingConv::HiPE)
 +    return 0;
 +  if (CS && !CS->paramHasAttr(1, Attribute::StructRet))
 +    return 0;
 +  if (CS && CS->paramHasAttr(1, Attribute::InReg))
 +    return 0;
 +  return 4;
 +}
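// Illustrative example (editorial sketch, not part of this patch): on 32-bit
// non-MSVCRT targets, a call returning a struct through a hidden sret
// pointer, e.g.
//   struct S { int a, b, c; };  struct S g(void);
// has the callee pop that 4-byte pointer, so this returns 4 and the
// CALLSEQ_END below receives NumBytesForCalleeToPop == 4; on x86-64 it is
// always 0.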
 +
 +bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
 +  auto &OutVals       = CLI.OutVals;
 +  auto &OutFlags      = CLI.OutFlags;
 +  auto &OutRegs       = CLI.OutRegs;
 +  auto &Ins           = CLI.Ins;
 +  auto &InRegs        = CLI.InRegs;
 +  CallingConv::ID CC  = CLI.CallConv;
 +  bool &IsTailCall    = CLI.IsTailCall;
 +  bool IsVarArg       = CLI.IsVarArg;
 +  const Value *Callee = CLI.Callee;
 +  const char *SymName = CLI.SymName;
 +
 +  bool Is64Bit        = Subtarget->is64Bit();
 +  bool IsWin64        = Subtarget->isCallingConvWin64(CC);
 +
+  // Handle only C, fastcc, webkit_js, x86 fastcall, and the x86-64 Win64 and
+  // SysV calling conventions for now.
 +  switch (CC) {
 +  default: return false;
 +  case CallingConv::C:
 +  case CallingConv::Fast:
 +  case CallingConv::WebKit_JS:
 +  case CallingConv::X86_FastCall:
 +  case CallingConv::X86_64_Win64:
 +  case CallingConv::X86_64_SysV:
 +    break;
 +  }
 +
 +  // Allow SelectionDAG isel to handle tail calls.
 +  if (IsTailCall)
 +    return false;
 +
 +  // fastcc with -tailcallopt is intended to provide a guaranteed
 +  // tail call optimization. Fastisel doesn't know how to do that.
 +  if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt)
 +    return false;
 +
 +  // Don't know how to handle Win64 varargs yet.  Nothing special needed for
 +  // x86-32. Special handling for x86-64 is implemented.
 +  if (IsVarArg && IsWin64)
 +    return false;
 +
 +  // Don't know about inalloca yet.
 +  if (CLI.CS && CLI.CS->hasInAllocaArgument())
 +    return false;
 +
 +  // Fast-isel doesn't know about callee-pop yet.
 +  if (X86::isCalleePop(CC, Subtarget->is64Bit(), IsVarArg,
 +                       TM.Options.GuaranteedTailCallOpt))
 +    return false;
 +
 +  SmallVector<MVT, 16> OutVTs;
 +  SmallVector<unsigned, 16> ArgRegs;
 +
 +  // If this is a constant i1/i8/i16 argument, promote to i32 to avoid an extra
 +  // instruction. This is safe because it is common to all FastISel supported
 +  // calling conventions on x86.
 +  for (int i = 0, e = OutVals.size(); i != e; ++i) {
 +    Value *&Val = OutVals[i];
 +    ISD::ArgFlagsTy Flags = OutFlags[i];
 +    if (auto *CI = dyn_cast<ConstantInt>(Val)) {
 +      if (CI->getBitWidth() < 32) {
 +        if (Flags.isSExt())
 +          Val = ConstantExpr::getSExt(CI, Type::getInt32Ty(CI->getContext()));
 +        else
 +          Val = ConstantExpr::getZExt(CI, Type::getInt32Ty(CI->getContext()));
 +      }
 +    }
 +
 +    // Passing bools around ends up doing a trunc to i1 and passing it.
 +    // Codegen this as an argument + "and 1".
 +    MVT VT;
 +    auto *TI = dyn_cast<TruncInst>(Val);
 +    unsigned ResultReg;
 +    if (TI && TI->getType()->isIntegerTy(1) && CLI.CS &&
 +              (TI->getParent() == CLI.CS->getInstruction()->getParent()) &&
 +              TI->hasOneUse()) {
 +      Value *PrevVal = TI->getOperand(0);
 +      ResultReg = getRegForValue(PrevVal);
 +
 +      if (!ResultReg)
 +        return false;
 +
 +      if (!isTypeLegal(PrevVal->getType(), VT))
 +        return false;
 +
 +      ResultReg =
 +        fastEmit_ri(VT, VT, ISD::AND, ResultReg, hasTrivialKill(PrevVal), 1);
 +    } else {
 +      if (!isTypeLegal(Val->getType(), VT))
 +        return false;
 +      ResultReg = getRegForValue(Val);
 +    }
 +
 +    if (!ResultReg)
 +      return false;
 +
 +    ArgRegs.push_back(ResultReg);
 +    OutVTs.push_back(VT);
 +  }
 +
 +  // Analyze operands of the call, assigning locations to each operand.
 +  SmallVector<CCValAssign, 16> ArgLocs;
 +  CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, ArgLocs, CLI.RetTy->getContext());
 +
 +  // Allocate shadow area for Win64
 +  if (IsWin64)
 +    CCInfo.AllocateStack(32, 8);
 +
 +  CCInfo.AnalyzeCallOperands(OutVTs, OutFlags, CC_X86);
 +
 +  // Get a count of how many bytes are to be pushed on the stack.
 +  unsigned NumBytes = CCInfo.getNextStackOffset();
 +
 +  // Issue CALLSEQ_START
 +  unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
 +  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown))
 +    .addImm(NumBytes).addImm(0);
 +
 +  // Walk the register/memloc assignments, inserting copies/loads.
 +  const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
 +      TM.getSubtargetImpl()->getRegisterInfo());
 +  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
 +    CCValAssign const &VA = ArgLocs[i];
 +    const Value *ArgVal = OutVals[VA.getValNo()];
 +    MVT ArgVT = OutVTs[VA.getValNo()];
 +
 +    if (ArgVT == MVT::x86mmx)
 +      return false;
 +
 +    unsigned ArgReg = ArgRegs[VA.getValNo()];
 +
 +    // Promote the value if needed.
 +    switch (VA.getLocInfo()) {
 +    case CCValAssign::Full: break;
 +    case CCValAssign::SExt: {
 +      assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
 +             "Unexpected extend");
 +      bool Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), ArgReg,
 +                                       ArgVT, ArgReg);
 +      assert(Emitted && "Failed to emit a sext!"); (void)Emitted;
 +      ArgVT = VA.getLocVT();
 +      break;
 +    }
 +    case CCValAssign::ZExt: {
 +      assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
 +             "Unexpected extend");
 +      bool Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), ArgReg,
 +                                       ArgVT, ArgReg);
 +      assert(Emitted && "Failed to emit a zext!"); (void)Emitted;
 +      ArgVT = VA.getLocVT();
 +      break;
 +    }
 +    case CCValAssign::AExt: {
 +      assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
 +             "Unexpected extend");
 +      bool Emitted = X86FastEmitExtend(ISD::ANY_EXTEND, VA.getLocVT(), ArgReg,
 +                                       ArgVT, ArgReg);
 +      if (!Emitted)
 +        Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), ArgReg,
 +                                    ArgVT, ArgReg);
 +      if (!Emitted)
 +        Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), ArgReg,
 +                                    ArgVT, ArgReg);
 +
+      assert(Emitted && "Failed to emit an aext!"); (void)Emitted;
 +      ArgVT = VA.getLocVT();
 +      break;
 +    }
 +    case CCValAssign::BCvt: {
 +      ArgReg = fastEmit_r(ArgVT, VA.getLocVT(), ISD::BITCAST, ArgReg,
 +                          /*TODO: Kill=*/false);
 +      assert(ArgReg && "Failed to emit a bitcast!");
 +      ArgVT = VA.getLocVT();
 +      break;
 +    }
 +    case CCValAssign::VExt:
+      // VExt has not been implemented, so this should be impossible to reach
+      // for now.  However, fall back to SelectionDAG isel once it is
+      // implemented.
 +      return false;
 +    case CCValAssign::AExtUpper:
 +    case CCValAssign::SExtUpper:
 +    case CCValAssign::ZExtUpper:
 +    case CCValAssign::FPExt:
 +      llvm_unreachable("Unexpected loc info!");
 +    case CCValAssign::Indirect:
 +      // FIXME: Indirect doesn't need extending, but fast-isel doesn't fully
 +      // support this.
 +      return false;
 +    }
 +
 +    if (VA.isRegLoc()) {
 +      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
 +              TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(ArgReg);
 +      OutRegs.push_back(VA.getLocReg());
 +    } else {
 +      assert(VA.isMemLoc());
 +
 +      // Don't emit stores for undef values.
 +      if (isa<UndefValue>(ArgVal))
 +        continue;
 +
 +      unsigned LocMemOffset = VA.getLocMemOffset();
 +      X86AddressMode AM;
 +      AM.Base.Reg = RegInfo->getStackRegister();
 +      AM.Disp = LocMemOffset;
 +      ISD::ArgFlagsTy Flags = OutFlags[VA.getValNo()];
 +      unsigned Alignment = DL.getABITypeAlignment(ArgVal->getType());
 +      MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
 +        MachinePointerInfo::getStack(LocMemOffset), MachineMemOperand::MOStore,
 +        ArgVT.getStoreSize(), Alignment);
 +      if (Flags.isByVal()) {
 +        X86AddressMode SrcAM;
 +        SrcAM.Base.Reg = ArgReg;
 +        if (!TryEmitSmallMemcpy(AM, SrcAM, Flags.getByValSize()))
 +          return false;
 +      } else if (isa<ConstantInt>(ArgVal) || isa<ConstantPointerNull>(ArgVal)) {
 +        // If this is a really simple value, emit this with the Value* version
 +        // of X86FastEmitStore.  If it isn't simple, we don't want to do this,
 +        // as it can cause us to reevaluate the argument.
 +        if (!X86FastEmitStore(ArgVT, ArgVal, AM, MMO))
 +          return false;
 +      } else {
 +        bool ValIsKill = hasTrivialKill(ArgVal);
 +        if (!X86FastEmitStore(ArgVT, ArgReg, ValIsKill, AM, MMO))
 +          return false;
 +      }
 +    }
 +  }
 +
+  // ELF / PIC requires the GOT pointer to be in the EBX register before
+  // function calls made via the PLT.
 +  if (Subtarget->isPICStyleGOT()) {
 +    unsigned Base = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
 +    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
 +            TII.get(TargetOpcode::COPY), X86::EBX).addReg(Base);
 +  }
 +
 +  if (Is64Bit && IsVarArg && !IsWin64) {
 +    // From AMD64 ABI document:
 +    // For calls that may call functions that use varargs or stdargs
 +    // (prototype-less calls or calls to functions containing ellipsis (...) in
 +    // the declaration) %al is used as hidden argument to specify the number
 +    // of SSE registers used. The contents of %al do not need to match exactly
+    // the number of registers, but must be an upper bound on the number of SSE
 +    // registers used and is in the range 0 - 8 inclusive.
 +
 +    // Count the number of XMM registers allocated.
 +    static const MCPhysReg XMMArgRegs[] = {
 +      X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
 +      X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
 +    };
 +    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
 +    assert((Subtarget->hasSSE1() || !NumXMMRegs)
 +           && "SSE registers cannot be used when SSE is disabled");
 +    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV8ri),
 +            X86::AL).addImm(NumXMMRegs);
 +  }
 +
 +  // Materialize callee address in a register. FIXME: GV address can be
 +  // handled with a CALLpcrel32 instead.
 +  X86AddressMode CalleeAM;
 +  if (!X86SelectCallAddress(Callee, CalleeAM))
 +    return false;
 +
 +  unsigned CalleeOp = 0;
 +  const GlobalValue *GV = nullptr;
 +  if (CalleeAM.GV != nullptr) {
 +    GV = CalleeAM.GV;
 +  } else if (CalleeAM.Base.Reg != 0) {
 +    CalleeOp = CalleeAM.Base.Reg;
 +  } else
 +    return false;
 +
 +  // Issue the call.
 +  MachineInstrBuilder MIB;
 +  if (CalleeOp) {
 +    // Register-indirect call.
 +    unsigned CallOpc = Is64Bit ? X86::CALL64r : X86::CALL32r;
 +    MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CallOpc))
 +      .addReg(CalleeOp);
 +  } else {
 +    // Direct call.
 +    assert(GV && "Not a direct call");
 +    unsigned CallOpc = Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32;
 +
 +    // See if we need any target-specific flags on the GV operand.
 +    unsigned char OpFlags = 0;
 +
 +    // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
+    // external symbols must go through the PLT in PIC mode.  If the symbol
 +    // has hidden or protected visibility, or if it is static or local, then
 +    // we don't need to use the PLT - we can directly call it.
 +    if (Subtarget->isTargetELF() &&
 +        TM.getRelocationModel() == Reloc::PIC_ &&
 +        GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
 +      OpFlags = X86II::MO_PLT;
 +    } else if (Subtarget->isPICStyleStubAny() &&
 +               (GV->isDeclaration() || GV->isWeakForLinker()) &&
 +               (!Subtarget->getTargetTriple().isMacOSX() ||
 +                Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
 +      // PC-relative references to external symbols should go through $stub,
 +      // unless we're building with the leopard linker or later, which
 +      // automatically synthesizes these stubs.
 +      OpFlags = X86II::MO_DARWIN_STUB;
 +    }
 +
 +    MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CallOpc));
 +    if (SymName)
 +      MIB.addExternalSymbol(SymName, OpFlags);
 +    else
 +      MIB.addGlobalAddress(GV, 0, OpFlags);
 +  }
 +
 +  // Add a register mask operand representing the call-preserved registers.
 +  // Proper defs for return values will be added by setPhysRegsDeadExcept().
 +  MIB.addRegMask(TRI.getCallPreservedMask(CC));
 +
 +  // Add an implicit use GOT pointer in EBX.
 +  if (Subtarget->isPICStyleGOT())
 +    MIB.addReg(X86::EBX, RegState::Implicit);
 +
 +  if (Is64Bit && IsVarArg && !IsWin64)
 +    MIB.addReg(X86::AL, RegState::Implicit);
 +
 +  // Add implicit physical register uses to the call.
 +  for (auto Reg : OutRegs)
 +    MIB.addReg(Reg, RegState::Implicit);
 +
 +  // Issue CALLSEQ_END
 +  unsigned NumBytesForCalleeToPop =
 +    computeBytesPoppedByCallee(Subtarget, CC, CLI.CS);
 +  unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
 +  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackUp))
 +    .addImm(NumBytes).addImm(NumBytesForCalleeToPop);
 +
 +  // Now handle call return values.
 +  SmallVector<CCValAssign, 16> RVLocs;
 +  CCState CCRetInfo(CC, IsVarArg, *FuncInfo.MF, RVLocs,
 +                    CLI.RetTy->getContext());
 +  CCRetInfo.AnalyzeCallResult(Ins, RetCC_X86);
 +
 +  // Copy all of the result registers out of their specified physreg.
 +  unsigned ResultReg = FuncInfo.CreateRegs(CLI.RetTy);
 +  for (unsigned i = 0; i != RVLocs.size(); ++i) {
 +    CCValAssign &VA = RVLocs[i];
 +    EVT CopyVT = VA.getValVT();
 +    unsigned CopyReg = ResultReg + i;
 +
 +    // If this is x86-64, and we disabled SSE, we can't return FP values
 +    if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
 +        ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
 +      report_fatal_error("SSE register return with SSE disabled");
 +    }
 +
 +    // If we prefer to use the value in xmm registers, copy it out as f80 and
 +    // use a truncate to move it from fp stack reg to xmm reg.
 +    if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
 +        isScalarFPTypeInSSEReg(VA.getValVT())) {
 +      CopyVT = MVT::f80;
 +      CopyReg = createResultReg(&X86::RFP80RegClass);
 +    }
 +
 +    // Copy out the result.
 +    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
 +            TII.get(TargetOpcode::COPY), CopyReg).addReg(VA.getLocReg());
 +    InRegs.push_back(VA.getLocReg());
 +
 +    // Round the f80 to the right size, which also moves it to the appropriate
 +    // xmm register. This is accomplished by storing the f80 value in memory
 +    // and then loading it back.
 +    if (CopyVT != VA.getValVT()) {
 +      EVT ResVT = VA.getValVT();
 +      unsigned Opc = ResVT == MVT::f32 ? X86::ST_Fp80m32 : X86::ST_Fp80m64;
 +      unsigned MemSize = ResVT.getSizeInBits()/8;
 +      int FI = MFI.CreateStackObject(MemSize, MemSize, false);
 +      addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
 +                                TII.get(Opc)), FI)
 +        .addReg(CopyReg);
 +      Opc = ResVT == MVT::f32 ? X86::MOVSSrm : X86::MOVSDrm;
 +      addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
 +                                TII.get(Opc), ResultReg + i), FI);
 +    }
 +  }
 +
 +  CLI.ResultReg = ResultReg;
 +  CLI.NumResultRegs = RVLocs.size();
 +  CLI.Call = MIB;
 +
 +  return true;
 +}
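// Rough shape of the machine code the routine above emits for a simple
// direct x86-64 call (editorial sketch only; pseudo-opcode names abbreviated):
//   ADJCALLSTACKDOWN NumBytes, 0
//   COPY %edi, %vreg_arg0             ; register arguments
//   MOV32mr [%rsp + off], %vreg_argN  ; stack arguments
//   CALL64pcrel32 @callee, <regmask>, implicit %edi, ...
//   ADJCALLSTACKUP NumBytes, NumBytesForCalleeToPop
//   COPY %vreg_ret, %eax              ; copy results out of physregs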
 +
 +bool
 +X86FastISel::fastSelectInstruction(const Instruction *I)  {
 +  switch (I->getOpcode()) {
 +  default: break;
 +  case Instruction::Load:
 +    return X86SelectLoad(I);
 +  case Instruction::Store:
 +    return X86SelectStore(I);
 +  case Instruction::Ret:
 +    return X86SelectRet(I);
 +  case Instruction::ICmp:
 +  case Instruction::FCmp:
 +    return X86SelectCmp(I);
 +  case Instruction::ZExt:
 +    return X86SelectZExt(I);
 +  case Instruction::Br:
 +    return X86SelectBranch(I);
 +  case Instruction::LShr:
 +  case Instruction::AShr:
 +  case Instruction::Shl:
 +    return X86SelectShift(I);
 +  case Instruction::SDiv:
 +  case Instruction::UDiv:
 +  case Instruction::SRem:
 +  case Instruction::URem:
 +    return X86SelectDivRem(I);
 +  case Instruction::Select:
 +    return X86SelectSelect(I);
 +  case Instruction::Trunc:
 +    return X86SelectTrunc(I);
 +  case Instruction::FPExt:
 +    return X86SelectFPExt(I);
 +  case Instruction::FPTrunc:
 +    return X86SelectFPTrunc(I);
 +  case Instruction::IntToPtr: // Deliberate fall-through.
 +  case Instruction::PtrToInt: {
 +    EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType());
 +    EVT DstVT = TLI.getValueType(I->getType());
 +    if (DstVT.bitsGT(SrcVT))
 +      return X86SelectZExt(I);
 +    if (DstVT.bitsLT(SrcVT))
 +      return X86SelectTrunc(I);
 +    unsigned Reg = getRegForValue(I->getOperand(0));
 +    if (Reg == 0) return false;
 +    updateValueMap(I, Reg);
 +    return true;
 +  }
 +  }
 +
 +  return false;
 +}
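// Illustrative note (editorial, not part of this patch): for IntToPtr and
// PtrToInt of equal width, e.g. 'ptrtoint i8* %p to i64' on x86-64, no
// instruction is emitted at all -- the operand's register is simply reused
// via updateValueMap.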
 +
 +unsigned X86FastISel::X86MaterializeInt(const ConstantInt *CI, MVT VT) {
 +  if (VT > MVT::i64)
 +    return 0;
 +
 +  uint64_t Imm = CI->getZExtValue();
 +  if (Imm == 0) {
 +    unsigned SrcReg = fastEmitInst_(X86::MOV32r0, &X86::GR32RegClass);
 +    switch (VT.SimpleTy) {
 +    default: llvm_unreachable("Unexpected value type");
 +    case MVT::i1:
 +    case MVT::i8:
 +      return fastEmitInst_extractsubreg(MVT::i8, SrcReg, /*Kill=*/true,
 +                                        X86::sub_8bit);
 +    case MVT::i16:
 +      return fastEmitInst_extractsubreg(MVT::i16, SrcReg, /*Kill=*/true,
 +                                        X86::sub_16bit);
 +    case MVT::i32:
 +      return SrcReg;
 +    case MVT::i64: {
 +      unsigned ResultReg = createResultReg(&X86::GR64RegClass);
 +      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
 +              TII.get(TargetOpcode::SUBREG_TO_REG), ResultReg)
 +        .addImm(0).addReg(SrcReg).addImm(X86::sub_32bit);
 +      return ResultReg;
 +    }
 +    }
 +  }
 +
 +  unsigned Opc = 0;
 +  switch (VT.SimpleTy) {
 +  default: llvm_unreachable("Unexpected value type");
 +  case MVT::i1:  VT = MVT::i8; // fall-through
 +  case MVT::i8:  Opc = X86::MOV8ri;  break;
 +  case MVT::i16: Opc = X86::MOV16ri; break;
 +  case MVT::i32: Opc = X86::MOV32ri; break;
 +  case MVT::i64: {
 +    if (isUInt<32>(Imm))
 +      Opc = X86::MOV32ri;
 +    else if (isInt<32>(Imm))
 +      Opc = X86::MOV64ri32;
 +    else
 +      Opc = X86::MOV64ri;
 +    break;
 +  }
 +  }
 +  if (VT == MVT::i64 && Opc == X86::MOV32ri) {
 +    unsigned SrcReg = fastEmitInst_i(Opc, &X86::GR32RegClass, Imm);
 +    unsigned ResultReg = createResultReg(&X86::GR64RegClass);
 +    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
 +            TII.get(TargetOpcode::SUBREG_TO_REG), ResultReg)
 +      .addImm(0).addReg(SrcReg).addImm(X86::sub_32bit);
 +    return ResultReg;
 +  }
 +  return fastEmitInst_i(Opc, TLI.getRegClassFor(VT), Imm);
 +}
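// Illustrative examples (editorial sketch, not part of this patch): an i64
// zero is built as MOV32r0 plus SUBREG_TO_REG, relying on the implicit
// zero-extension of 32-bit defs; a positive i64 constant that fits in 32
// unsigned bits likewise gets MOV32ri plus SUBREG_TO_REG rather than the
// 10-byte MOV64ri encoding.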
 +
 +unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) {
 +  if (CFP->isNullValue())
 +    return fastMaterializeFloatZero(CFP);
 +
 +  // Can't handle alternate code models yet.
 +  CodeModel::Model CM = TM.getCodeModel();
 +  if (CM != CodeModel::Small && CM != CodeModel::Large)
 +    return 0;
 +
 +  // Get opcode and regclass of the output for the given load instruction.
 +  unsigned Opc = 0;
 +  const TargetRegisterClass *RC = nullptr;
 +  switch (VT.SimpleTy) {
 +  default: return 0;
 +  case MVT::f32:
 +    if (X86ScalarSSEf32) {
 +      Opc = Subtarget->hasAVX() ? X86::VMOVSSrm : X86::MOVSSrm;
 +      RC  = &X86::FR32RegClass;
 +    } else {
 +      Opc = X86::LD_Fp32m;
 +      RC  = &X86::RFP32RegClass;
 +    }
 +    break;
 +  case MVT::f64:
 +    if (X86ScalarSSEf64) {
 +      Opc = Subtarget->hasAVX() ? X86::VMOVSDrm : X86::MOVSDrm;
 +      RC  = &X86::FR64RegClass;
 +    } else {
 +      Opc = X86::LD_Fp64m;
 +      RC  = &X86::RFP64RegClass;
 +    }
 +    break;
 +  case MVT::f80:
 +    // No f80 support yet.
 +    return 0;
 +  }
 +
 +  // MachineConstantPool wants an explicit alignment.
 +  unsigned Align = DL.getPrefTypeAlignment(CFP->getType());
 +  if (Align == 0) {
 +    // Alignment of vector types. FIXME!
 +    Align = DL.getTypeAllocSize(CFP->getType());
 +  }
 +
 +  // x86-32 PIC requires a PIC base register for constant pools.
 +  unsigned PICBase = 0;
 +  unsigned char OpFlag = 0;
 +  if (Subtarget->isPICStyleStubPIC()) { // Not dynamic-no-pic
 +    OpFlag = X86II::MO_PIC_BASE_OFFSET;
 +    PICBase = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
 +  } else if (Subtarget->isPICStyleGOT()) {
 +    OpFlag = X86II::MO_GOTOFF;
 +    PICBase = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
 +  } else if (Subtarget->isPICStyleRIPRel() &&
 +             TM.getCodeModel() == CodeModel::Small) {
 +    PICBase = X86::RIP;
 +  }
 +
 +  // Create the load from the constant pool.
 +  unsigned CPI = MCP.getConstantPoolIndex(CFP, Align);
 +  unsigned ResultReg = createResultReg(RC);
 +
 +  if (CM == CodeModel::Large) {
 +    unsigned AddrReg = createResultReg(&X86::GR64RegClass);
 +    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV64ri),
 +            AddrReg)
 +      .addConstantPoolIndex(CPI, 0, OpFlag);
 +    MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
 +                                      TII.get(Opc), ResultReg);
 +    addDirectMem(MIB, AddrReg);
 +    MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
 +        MachinePointerInfo::getConstantPool(), MachineMemOperand::MOLoad,
 +        TM.getDataLayout()->getPointerSize(), Align);
 +    MIB->addMemOperand(*FuncInfo.MF, MMO);
 +    return ResultReg;
 +  }
 +
 +  addConstantPoolReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
 +                                   TII.get(Opc), ResultReg),
 +                           CPI, PICBase, OpFlag);
 +  return ResultReg;
 +}
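// Illustrative example (editorial sketch, not part of this patch):
// materializing 'double 3.14' creates a constant-pool entry and, under the
// small code model, a single MOVSDrm (VMOVSDrm with AVX) from that pool,
// addressed through the PIC base or RIP as selected above; under the large
// code model the pool address is first put into a GR64 with MOV64ri and the
// load goes through that register.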
 +
 +unsigned X86FastISel::X86MaterializeGV(const GlobalValue *GV, MVT VT) {
 +  // Can't handle alternate code models yet.
 +  if (TM.getCodeModel() != CodeModel::Small)
 +    return 0;
 +
 +  // Materialize addresses with LEA/MOV instructions.
 +  X86AddressMode AM;
 +  if (X86SelectAddress(GV, AM)) {
 +    // If the expression is just a basereg, then we're done, otherwise we need
 +    // to emit an LEA.
 +    if (AM.BaseType == X86AddressMode::RegBase &&
 +        AM.IndexReg == 0 && AM.Disp == 0 && AM.GV == nullptr)
 +      return AM.Base.Reg;
 +
 +    unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
 +    if (TM.getRelocationModel() == Reloc::Static &&
 +        TLI.getPointerTy() == MVT::i64) {
+      // The displacement could be more than 32 bits away, so we need to use
+      // an instruction with a 64-bit immediate.
 +      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV64ri),
 +              ResultReg)
 +        .addGlobalAddress(GV);
 +    } else {
 +      unsigned Opc = TLI.getPointerTy() == MVT::i32
 +                     ? (Subtarget->isTarget64BitILP32()
 +                        ? X86::LEA64_32r : X86::LEA32r)
 +                     : X86::LEA64r;
 +      addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
 +                             TII.get(Opc), ResultReg), AM);
 +    }
 +    return ResultReg;
 +  }
 +  return 0;
 +}
 +
 +unsigned X86FastISel::fastMaterializeConstant(const Constant *C) {
 +  EVT CEVT = TLI.getValueType(C->getType(), true);
 +
 +  // Only handle simple types.
 +  if (!CEVT.isSimple())
 +    return 0;
 +  MVT VT = CEVT.getSimpleVT();
 +
 +  if (const auto *CI = dyn_cast<ConstantInt>(C))
 +    return X86MaterializeInt(CI, VT);
 +  else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C))
 +    return X86MaterializeFP(CFP, VT);
 +  else if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
 +    return X86MaterializeGV(GV, VT);
 +
 +  return 0;
 +}
 +
 +unsigned X86FastISel::fastMaterializeAlloca(const AllocaInst *C) {
 +  // Fail on dynamic allocas. At this point, getRegForValue has already
 +  // checked its CSE maps, so if we're here trying to handle a dynamic
 +  // alloca, we're not going to succeed. X86SelectAddress has a
 +  // check for dynamic allocas, because it's called directly from
+  // various places, but fastMaterializeAlloca also needs a check
+  // in order to avoid recursion between getRegForValue,
+  // X86SelectAddress, and fastMaterializeAlloca.
 +  if (!FuncInfo.StaticAllocaMap.count(C))
 +    return 0;
 +  assert(C->isStaticAlloca() && "dynamic alloca in the static alloca map?");
 +
 +  X86AddressMode AM;
 +  if (!X86SelectAddress(C, AM))
 +    return 0;
 +  unsigned Opc = TLI.getPointerTy() == MVT::i32
 +                 ? (Subtarget->isTarget64BitILP32()
 +                    ? X86::LEA64_32r : X86::LEA32r)
 +                 : X86::LEA64r;
 +  const TargetRegisterClass* RC = TLI.getRegClassFor(TLI.getPointerTy());
 +  unsigned ResultReg = createResultReg(RC);
 +  addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
 +                         TII.get(Opc), ResultReg), AM);
 +  return ResultReg;
 +}
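// Illustrative example (editorial sketch, not part of this patch): for a
// static
//   %buf = alloca [16 x i8], align 16
// this emits a single LEA of the corresponding frame index, i.e. an address
// computation only; no memory is touched.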
 +
 +unsigned X86FastISel::fastMaterializeFloatZero(const ConstantFP *CF) {
 +  MVT VT;
 +  if (!isTypeLegal(CF->getType(), VT))
 +    return 0;
 +
 +  // Get opcode and regclass for the given zero.
 +  unsigned Opc = 0;
 +  const TargetRegisterClass *RC = nullptr;
 +  switch (VT.SimpleTy) {
 +  default: return 0;
 +  case MVT::f32:
 +    if (X86ScalarSSEf32) {
 +      Opc = X86::FsFLD0SS;
 +      RC  = &X86::FR32RegClass;
 +    } else {
 +      Opc = X86::LD_Fp032;
 +      RC  = &X86::RFP32RegClass;
 +    }
 +    break;
 +  case MVT::f64:
 +    if (X86ScalarSSEf64) {
 +      Opc = X86::FsFLD0SD;
 +      RC  = &X86::FR64RegClass;
 +    } else {
 +      Opc = X86::LD_Fp064;
 +      RC  = &X86::RFP64RegClass;
 +    }
 +    break;
 +  case MVT::f80:
 +    // No f80 support yet.
 +    return 0;
 +  }
 +
 +  unsigned ResultReg = createResultReg(RC);
 +  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg);
 +  return ResultReg;
 +}
 +
 +
 +bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
 +                                      const LoadInst *LI) {
 +  const Value *Ptr = LI->getPointerOperand();
 +  X86AddressMode AM;
 +  if (!X86SelectAddress(Ptr, AM))
 +    return false;
 +
 +  const X86InstrInfo &XII = (const X86InstrInfo &)TII;
 +
 +  unsigned Size = DL.getTypeAllocSize(LI->getType());
 +  unsigned Alignment = LI->getAlignment();
 +
 +  if (Alignment == 0)  // Ensure that codegen never sees alignment 0
 +    Alignment = DL.getABITypeAlignment(LI->getType());
 +
 +  SmallVector<MachineOperand, 8> AddrOps;
 +  AM.getFullAddress(AddrOps);
 +
 +  MachineInstr *Result =
 +    XII.foldMemoryOperandImpl(*FuncInfo.MF, MI, OpNo, AddrOps,
 +                              Size, Alignment, /*AllowCommute=*/true);
 +  if (!Result)
 +    return false;
 +
 +  Result->addMemOperand(*FuncInfo.MF, createMachineMemOperandFor(LI));
 +  FuncInfo.MBB->insert(FuncInfo.InsertPt, Result);
 +  MI->eraseFromParent();
 +  return true;
 +}
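// Illustrative example (editorial sketch, not part of this patch): a
// register-register compare fed by a one-use load of a simple address, e.g.
// CMP32rr whose operand comes from a MOV32rm, can be rewritten by
// foldMemoryOperandImpl into a single CMP32rm with the load's address folded
// in, so the caller can drop the separate load instruction.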
 +
 +
 +namespace llvm {
 +  FastISel *X86::createFastISel(FunctionLoweringInfo &funcInfo,
 +                                const TargetLibraryInfo *libInfo) {
 +    return new X86FastISel(funcInfo, libInfo);
 +  }
 +}
 diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp index 930163c3688..80b141654c0 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -1,2103 +1,2020 @@ -//===-- X86FrameLowering.cpp - X86 Frame Information ----------------------===// -// -//                     The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the X86 implementation of TargetFrameLowering class. -// -//===----------------------------------------------------------------------===// - -#include "X86FrameLowering.h" -#include "X86InstrBuilder.h" -#include "X86InstrInfo.h" -#include "X86MachineFunctionInfo.h" -#include "X86Subtarget.h" -#include "X86TargetMachine.h" -#include "llvm/ADT/SmallSet.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineModuleInfo.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/Function.h" -#include "llvm/MC/MCAsmInfo.h" -#include "llvm/MC/MCSymbol.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Target/TargetOptions.h" -#include "llvm/Support/Debug.h" -#include <cstdlib> - -using namespace llvm; - -// FIXME: completely move here. -extern cl::opt<bool> ForceStackAlign; - -bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { -  return !MF.getFrameInfo()->hasVarSizedObjects(); -} - -/// hasFP - Return true if the specified function should have a dedicated frame -/// pointer register.  This is true if the function has variable sized allocas -/// or if frame pointer elimination is disabled. -bool X86FrameLowering::hasFP(const MachineFunction &MF) const { -  const MachineFrameInfo *MFI = MF.getFrameInfo(); -  const MachineModuleInfo &MMI = MF.getMMI(); -  const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); - -  return (MF.getTarget().Options.DisableFramePointerElim(MF) || -          RegInfo->needsStackRealignment(MF) || -          MFI->hasVarSizedObjects() || -          MFI->isFrameAddressTaken() || MFI->hasInlineAsmWithSPAdjust() || -          MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() || -          MMI.callsUnwindInit() || MMI.callsEHReturn() || -          MFI->hasStackMap() || MFI->hasPatchPoint()); -} - -static unsigned getSUBriOpcode(unsigned IsLP64, int64_t Imm) { -  if (IsLP64) { -    if (isInt<8>(Imm)) -      return X86::SUB64ri8; -    return X86::SUB64ri32; -  } else { -    if (isInt<8>(Imm)) -      return X86::SUB32ri8; -    return X86::SUB32ri; -  } -} - -static unsigned getADDriOpcode(unsigned IsLP64, int64_t Imm) { -  if (IsLP64) { -    if (isInt<8>(Imm)) -      return X86::ADD64ri8; -    return X86::ADD64ri32; -  } else { -    if (isInt<8>(Imm)) -      return X86::ADD32ri8; -    return X86::ADD32ri; -  } -} - -static unsigned getSUBrrOpcode(unsigned isLP64) { -  return isLP64 ? X86::SUB64rr : X86::SUB32rr; -} - -static unsigned getADDrrOpcode(unsigned isLP64) { -  return isLP64 ? 
X86::ADD64rr : X86::ADD32rr; -} - -static unsigned getANDriOpcode(bool IsLP64, int64_t Imm) { -  if (IsLP64) { -    if (isInt<8>(Imm)) -      return X86::AND64ri8; -    return X86::AND64ri32; -  } -  if (isInt<8>(Imm)) -    return X86::AND32ri8; -  return X86::AND32ri; -} - -static unsigned getPUSHiOpcode(bool IsLP64, MachineOperand MO) { -  // We don't support LP64 for now. -  assert(!IsLP64); - -  if (MO.isImm() && isInt<8>(MO.getImm())) -    return X86::PUSH32i8; - -  return X86::PUSHi32;; -} - -static unsigned getLEArOpcode(unsigned IsLP64) { -  return IsLP64 ? X86::LEA64r : X86::LEA32r; -} - -/// findDeadCallerSavedReg - Return a caller-saved register that isn't live -/// when it reaches the "return" instruction. We can then pop a stack object -/// to this register without worry about clobbering it. -static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB, -                                       MachineBasicBlock::iterator &MBBI, -                                       const TargetRegisterInfo &TRI, -                                       bool Is64Bit) { -  const MachineFunction *MF = MBB.getParent(); -  const Function *F = MF->getFunction(); -  if (!F || MF->getMMI().callsEHReturn()) -    return 0; - -  static const uint16_t CallerSavedRegs32Bit[] = { -    X86::EAX, X86::EDX, X86::ECX, 0 -  }; - -  static const uint16_t CallerSavedRegs64Bit[] = { -    X86::RAX, X86::RDX, X86::RCX, X86::RSI, X86::RDI, -    X86::R8,  X86::R9,  X86::R10, X86::R11, 0 -  }; - -  unsigned Opc = MBBI->getOpcode(); -  switch (Opc) { -  default: return 0; -  case X86::RETL: -  case X86::RETQ: -  case X86::RETIL: -  case X86::RETIQ: -  case X86::TCRETURNdi: -  case X86::TCRETURNri: -  case X86::TCRETURNmi: -  case X86::TCRETURNdi64: -  case X86::TCRETURNri64: -  case X86::TCRETURNmi64: -  case X86::EH_RETURN: -  case X86::EH_RETURN64: { -    SmallSet<uint16_t, 8> Uses; -    for (unsigned i = 0, e = MBBI->getNumOperands(); i != e; ++i) { -      MachineOperand &MO = MBBI->getOperand(i); -      if (!MO.isReg() || MO.isDef()) -        continue; -      unsigned Reg = MO.getReg(); -      if (!Reg) -        continue; -      for (MCRegAliasIterator AI(Reg, &TRI, true); AI.isValid(); ++AI) -        Uses.insert(*AI); -    } - -    const uint16_t *CS = Is64Bit ? CallerSavedRegs64Bit : CallerSavedRegs32Bit; -    for (; *CS; ++CS) -      if (!Uses.count(*CS)) -        return *CS; -  } -  } - -  return 0; -} - -static bool isEAXLiveIn(MachineFunction &MF) { -  for (MachineRegisterInfo::livein_iterator II = MF.getRegInfo().livein_begin(), -       EE = MF.getRegInfo().livein_end(); II != EE; ++II) { -    unsigned Reg = II->first; - -    if (Reg == X86::RAX || Reg == X86::EAX || Reg == X86::AX || -        Reg == X86::AH || Reg == X86::AL) -      return true; -  } - -  return false; -} - -/// emitSPUpdate - Emit a series of instructions to increment / decrement the -/// stack pointer by a constant value. -static -void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, -                  unsigned StackPtr, int64_t NumBytes, -                  bool Is64BitTarget, bool Is64BitStackPtr, bool UseLEA, -                  const TargetInstrInfo &TII, const TargetRegisterInfo &TRI) { -  bool isSub = NumBytes < 0; -  uint64_t Offset = isSub ? -NumBytes : NumBytes; -  unsigned Opc; -  if (UseLEA) -    Opc = getLEArOpcode(Is64BitStackPtr); -  else -    Opc = isSub -      ? 
getSUBriOpcode(Is64BitStackPtr, Offset) -      : getADDriOpcode(Is64BitStackPtr, Offset); - -  uint64_t Chunk = (1LL << 31) - 1; -  DebugLoc DL = MBB.findDebugLoc(MBBI); - -  while (Offset) { -    if (Offset > Chunk) { -      // Rather than emit a long series of instructions for large offsets, -      // load the offset into a register and do one sub/add -      unsigned Reg = 0; - -      if (isSub && !isEAXLiveIn(*MBB.getParent())) -        Reg = (unsigned)(Is64BitTarget ? X86::RAX : X86::EAX); -      else -        Reg = findDeadCallerSavedReg(MBB, MBBI, TRI, Is64BitTarget); - -      if (Reg) { -        Opc = Is64BitTarget ? X86::MOV64ri : X86::MOV32ri; -        BuildMI(MBB, MBBI, DL, TII.get(Opc), Reg) -          .addImm(Offset); -        Opc = isSub -          ? getSUBrrOpcode(Is64BitTarget) -          : getADDrrOpcode(Is64BitTarget); -        MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr) -          .addReg(StackPtr) -          .addReg(Reg); -        MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead. -        Offset = 0; -        continue; -      } -    } - -    uint64_t ThisVal = (Offset > Chunk) ? Chunk : Offset; -    if (ThisVal == (Is64BitTarget ? 8 : 4)) { -      // Use push / pop instead. -      unsigned Reg = isSub -        ? (unsigned)(Is64BitTarget ? X86::RAX : X86::EAX) -        : findDeadCallerSavedReg(MBB, MBBI, TRI, Is64BitTarget); -      if (Reg) { -        Opc = isSub -          ? (Is64BitTarget ? X86::PUSH64r : X86::PUSH32r) -          : (Is64BitTarget ? X86::POP64r  : X86::POP32r); -        MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc)) -          .addReg(Reg, getDefRegState(!isSub) | getUndefRegState(isSub)); -        if (isSub) -          MI->setFlag(MachineInstr::FrameSetup); -        Offset -= ThisVal; -        continue; -      } -    } - -    MachineInstr *MI = nullptr; - -    if (UseLEA) { -      MI =  addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr), -                          StackPtr, false, isSub ? -ThisVal : ThisVal); -    } else { -      MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr) -            .addReg(StackPtr) -            .addImm(ThisVal); -      MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead. -    } - -    if (isSub) -      MI->setFlag(MachineInstr::FrameSetup); - -    Offset -= ThisVal; -  } -} - -/// mergeSPUpdatesUp - Merge two stack-manipulating instructions upper iterator. -static -void mergeSPUpdatesUp(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, -                      unsigned StackPtr, uint64_t *NumBytes = nullptr) { -  if (MBBI == MBB.begin()) return; - -  MachineBasicBlock::iterator PI = std::prev(MBBI); -  unsigned Opc = PI->getOpcode(); -  if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 || -       Opc == X86::ADD32ri || Opc == X86::ADD32ri8 || -       Opc == X86::LEA32r || Opc == X86::LEA64_32r) && -      PI->getOperand(0).getReg() == StackPtr) { -    if (NumBytes) -      *NumBytes += PI->getOperand(2).getImm(); -    MBB.erase(PI); -  } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 || -              Opc == X86::SUB32ri || Opc == X86::SUB32ri8) && -             PI->getOperand(0).getReg() == StackPtr) { -    if (NumBytes) -      *NumBytes -= PI->getOperand(2).getImm(); -    MBB.erase(PI); -  } -} - -/// mergeSPUpdatesDown - Merge two stack-manipulating instructions lower -/// iterator. 
-static -void mergeSPUpdatesDown(MachineBasicBlock &MBB, -                        MachineBasicBlock::iterator &MBBI, -                        unsigned StackPtr, uint64_t *NumBytes = nullptr) { -  // FIXME:  THIS ISN'T RUN!!! -  return; - -  if (MBBI == MBB.end()) return; - -  MachineBasicBlock::iterator NI = std::next(MBBI); -  if (NI == MBB.end()) return; - -  unsigned Opc = NI->getOpcode(); -  if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 || -       Opc == X86::ADD32ri || Opc == X86::ADD32ri8) && -      NI->getOperand(0).getReg() == StackPtr) { -    if (NumBytes) -      *NumBytes -= NI->getOperand(2).getImm(); -    MBB.erase(NI); -    MBBI = NI; -  } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 || -              Opc == X86::SUB32ri || Opc == X86::SUB32ri8) && -             NI->getOperand(0).getReg() == StackPtr) { -    if (NumBytes) -      *NumBytes += NI->getOperand(2).getImm(); -    MBB.erase(NI); -    MBBI = NI; -  } -} - -/// mergeSPUpdates - Checks the instruction before/after the passed -/// instruction. If it is an ADD/SUB/LEA instruction it is deleted argument and -/// the stack adjustment is returned as a positive value for ADD/LEA and a -/// negative for SUB. -static int mergeSPUpdates(MachineBasicBlock &MBB, -                          MachineBasicBlock::iterator &MBBI, unsigned StackPtr, -                          bool doMergeWithPrevious) { -  if ((doMergeWithPrevious && MBBI == MBB.begin()) || -      (!doMergeWithPrevious && MBBI == MBB.end())) -    return 0; - -  MachineBasicBlock::iterator PI = doMergeWithPrevious ? std::prev(MBBI) : MBBI; -  MachineBasicBlock::iterator NI = doMergeWithPrevious ? nullptr -                                                       : std::next(MBBI); -  unsigned Opc = PI->getOpcode(); -  int Offset = 0; - -  if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 || -       Opc == X86::ADD32ri || Opc == X86::ADD32ri8 || -       Opc == X86::LEA32r || Opc == X86::LEA64_32r) && -      PI->getOperand(0).getReg() == StackPtr){ -    Offset += PI->getOperand(2).getImm(); -    MBB.erase(PI); -    if (!doMergeWithPrevious) MBBI = NI; -  } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 || -              Opc == X86::SUB32ri || Opc == X86::SUB32ri8) && -             PI->getOperand(0).getReg() == StackPtr) { -    Offset -= PI->getOperand(2).getImm(); -    MBB.erase(PI); -    if (!doMergeWithPrevious) MBBI = NI; -  } - -  return Offset; -} - -void -X86FrameLowering::emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, -                                            MachineBasicBlock::iterator MBBI, -                                            DebugLoc DL) const { -  MachineFunction &MF = *MBB.getParent(); -  MachineFrameInfo *MFI = MF.getFrameInfo(); -  MachineModuleInfo &MMI = MF.getMMI(); -  const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo(); -  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); - -  // Add callee saved registers to move list. -  const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); -  if (CSI.empty()) return; - -  // Calculate offsets. 
-  for (std::vector<CalleeSavedInfo>::const_iterator -         I = CSI.begin(), E = CSI.end(); I != E; ++I) { -    int64_t Offset = MFI->getObjectOffset(I->getFrameIdx()); -    unsigned Reg = I->getReg(); - -    unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true); -    unsigned CFIIndex = -        MMI.addFrameInst(MCCFIInstruction::createOffset(nullptr, DwarfReg, -                                                        Offset)); -    BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) -        .addCFIIndex(CFIIndex); -  } -} - -/// usesTheStack - This function checks if any of the users of EFLAGS -/// copies the EFLAGS. We know that the code that lowers COPY of EFLAGS has -/// to use the stack, and if we don't adjust the stack we clobber the first -/// frame index. -/// See X86InstrInfo::copyPhysReg. -static bool usesTheStack(const MachineFunction &MF) { -  const MachineRegisterInfo &MRI = MF.getRegInfo(); - -  for (MachineRegisterInfo::reg_instr_iterator -       ri = MRI.reg_instr_begin(X86::EFLAGS), re = MRI.reg_instr_end(); -       ri != re; ++ri) -    if (ri->isCopy()) -      return true; - -  return false; -} - -void X86FrameLowering::emitStackProbeCall(MachineFunction &MF, -                                          MachineBasicBlock &MBB, -                                          MachineBasicBlock::iterator MBBI, -                                          DebugLoc DL) { -  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); -  const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); -  bool Is64Bit = STI.is64Bit(); -  bool IsLargeCodeModel = MF.getTarget().getCodeModel() == CodeModel::Large; - -  unsigned CallOp; -  if (Is64Bit) -    CallOp = IsLargeCodeModel ? X86::CALL64r : X86::CALL64pcrel32; -  else -    CallOp = X86::CALLpcrel32; - -  const char *Symbol; -  if (Is64Bit) { -    if (STI.isTargetCygMing()) { -      Symbol = "___chkstk_ms"; -    } else { -      Symbol = "__chkstk"; -    } -  } else if (STI.isTargetCygMing()) -    Symbol = "_alloca"; -  else -    Symbol = "_chkstk"; - -  MachineInstrBuilder CI; - -  // All current stack probes take AX and SP as input, clobber flags, and -  // preserve all registers. x86_64 probes leave RSP unmodified. -  if (Is64Bit && MF.getTarget().getCodeModel() == CodeModel::Large) { -    // For the large code model, we have to call through a register. Use R11, -    // as it is scratch in all supported calling conventions. -    BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::R11) -        .addExternalSymbol(Symbol); -    CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp)).addReg(X86::R11); -  } else { -    CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp)).addExternalSymbol(Symbol); -  } - -  unsigned AX = Is64Bit ? X86::RAX : X86::EAX; -  unsigned SP = Is64Bit ? X86::RSP : X86::ESP; -  CI.addReg(AX, RegState::Implicit) -      .addReg(SP, RegState::Implicit) -      .addReg(AX, RegState::Define | RegState::Implicit) -      .addReg(SP, RegState::Define | RegState::Implicit) -      .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); - -  if (Is64Bit) { -    // MSVC x64's __chkstk and cygwin/mingw's ___chkstk_ms do not adjust %rsp -    // themselves. It also does not clobber %rax so we can reuse it when -    // adjusting %rsp. -    BuildMI(MBB, MBBI, DL, TII.get(X86::SUB64rr), X86::RSP) -        .addReg(X86::RSP) -        .addReg(X86::RAX); -  } -} - -/// emitPrologue - Push callee-saved registers onto the stack, which -/// automatically adjust the stack pointer. 
Adjust the stack pointer to allocate -/// space for local variables. Also emit labels used by the exception handler to -/// generate the exception handling frames. - -/* -  Here's a gist of what gets emitted: - -  ; Establish frame pointer, if needed -  [if needs FP] -      push  %rbp -      .cfi_def_cfa_offset 16 -      .cfi_offset %rbp, -16 -      .seh_pushreg %rpb -      mov  %rsp, %rbp -      .cfi_def_cfa_register %rbp - -  ; Spill general-purpose registers -  [for all callee-saved GPRs] -      pushq %<reg> -      [if not needs FP] -         .cfi_def_cfa_offset (offset from RETADDR) -      .seh_pushreg %<reg> - -  ; If the required stack alignment > default stack alignment -  ; rsp needs to be re-aligned.  This creates a "re-alignment gap" -  ; of unknown size in the stack frame. -  [if stack needs re-alignment] -      and  $MASK, %rsp - -  ; Allocate space for locals -  [if target is Windows and allocated space > 4096 bytes] -      ; Windows needs special care for allocations larger -      ; than one page. -      mov $NNN, %rax -      call ___chkstk_ms/___chkstk -      sub  %rax, %rsp -  [else] -      sub  $NNN, %rsp - -  [if needs FP] -      .seh_stackalloc (size of XMM spill slots) -      .seh_setframe %rbp, SEHFrameOffset ; = size of all spill slots -  [else] -      .seh_stackalloc NNN - -  ; Spill XMMs -  ; Note, that while only Windows 64 ABI specifies XMMs as callee-preserved, -  ; they may get spilled on any platform, if the current function -  ; calls @llvm.eh.unwind.init -  [if needs FP] -      [for all callee-saved XMM registers] -          movaps  %<xmm reg>, -MMM(%rbp) -      [for all callee-saved XMM registers] -          .seh_savexmm %<xmm reg>, (-MMM + SEHFrameOffset) -              ; i.e. the offset relative to (%rbp - SEHFrameOffset) -  [else] -      [for all callee-saved XMM registers] -          movaps  %<xmm reg>, KKK(%rsp) -      [for all callee-saved XMM registers] -          .seh_savexmm %<xmm reg>, KKK - -  .seh_endprologue - -  [if needs base pointer] -      mov  %rsp, %rbx -      [if needs to restore base pointer] -          mov %rsp, -MMM(%rbp) - -  ; Emit CFI info -  [if needs FP] -      [for all callee-saved registers] -          .cfi_offset %<reg>, (offset from %rbp) -  [else] -       .cfi_def_cfa_offset (offset from RETADDR) -      [for all callee-saved registers] -          .cfi_offset %<reg>, (offset from %rsp) - -  Notes: -  - .seh directives are emitted only for Windows 64 ABI -  - .cfi directives are emitted for all other ABIs -  - for 32-bit code, substitute %e?? registers for %r?? -*/ - -void X86FrameLowering::emitPrologue(MachineFunction &MF) const { -  MachineBasicBlock &MBB = MF.front(); // Prologue goes in entry BB. -  MachineBasicBlock::iterator MBBI = MBB.begin(); -  MachineFrameInfo *MFI = MF.getFrameInfo(); -  const Function *Fn = MF.getFunction(); -  const X86RegisterInfo *RegInfo = -      static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo()); -  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); -  MachineModuleInfo &MMI = MF.getMMI(); -  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); -  uint64_t MaxAlign  = MFI->getMaxAlignment(); // Desired stack alignment. -  uint64_t StackSize = MFI->getStackSize();    // Number of bytes to allocate. -  bool HasFP = hasFP(MF); -  const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); -  bool Is64Bit = STI.is64Bit(); -  // standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit. 
-  const bool Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64(); -  bool IsWin64 = STI.isTargetWin64(); -  // Not necessarily synonymous with IsWin64. -  bool IsWinEH = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); -  bool NeedsWinEH = IsWinEH && Fn->needsUnwindTableEntry(); -  bool NeedsDwarfCFI = -      !IsWinEH && (MMI.hasDebugInfo() || Fn->needsUnwindTableEntry()); -  bool UseLEA = STI.useLeaForSP(); -  unsigned StackAlign = getStackAlignment(); -  unsigned SlotSize = RegInfo->getSlotSize(); -  unsigned FramePtr = RegInfo->getFrameRegister(MF); -  const unsigned MachineFramePtr = STI.isTarget64BitILP32() ? -                 getX86SubSuperRegister(FramePtr, MVT::i64, false) : FramePtr; -  unsigned StackPtr = RegInfo->getStackRegister(); -  unsigned BasePtr = RegInfo->getBaseRegister(); -  DebugLoc DL; - -  // If we're forcing a stack realignment we can't rely on just the frame -  // info, we need to know the ABI stack alignment as well in case we -  // have a call out.  Otherwise just make sure we have some alignment - we'll -  // go with the minimum SlotSize. -  if (ForceStackAlign) { -    if (MFI->hasCalls()) -      MaxAlign = (StackAlign > MaxAlign) ? StackAlign : MaxAlign; -    else if (MaxAlign < SlotSize) -      MaxAlign = SlotSize; -  } - -  // Add RETADDR move area to callee saved frame size. -  int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta(); -  if (TailCallReturnAddrDelta < 0) -    X86FI->setCalleeSavedFrameSize( -      X86FI->getCalleeSavedFrameSize() - TailCallReturnAddrDelta); - -  bool UseStackProbe = (STI.isOSWindows() && !STI.isTargetMachO()); - -  // The default stack probe size is 4096 if the function has no stackprobesize -  // attribute. -  unsigned StackProbeSize = 4096; -  if (Fn->hasFnAttribute("stack-probe-size")) -    Fn->getFnAttribute("stack-probe-size") -        .getValueAsString() -        .getAsInteger(0, StackProbeSize); - -  // If this is x86-64 and the Red Zone is not disabled, if we are a leaf -  // function, and use up to 128 bytes of stack space, don't have a frame -  // pointer, calls, or dynamic alloca then we do not need to adjust the -  // stack pointer (we fit in the Red Zone). We also check that we don't -  // push and pop from the stack. -  if (Is64Bit && !Fn->getAttributes().hasAttribute(AttributeSet::FunctionIndex, -                                                   Attribute::NoRedZone) && -      !RegInfo->needsStackRealignment(MF) && -      !MFI->hasVarSizedObjects() &&                     // No dynamic alloca. -      !MFI->adjustsStack() &&                           // No calls. -      !IsWin64 &&                                       // Win64 has no Red Zone -      !usesTheStack(MF) &&                              // Don't push and pop. -      !MF.shouldSplitStack()) {                         // Regular stack -    uint64_t MinSize = X86FI->getCalleeSavedFrameSize(); -    if (HasFP) MinSize += SlotSize; -    StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0); -    MFI->setStackSize(StackSize); -  } - -  // Insert stack pointer adjustment for later moving of return addr.  Only -  // applies to tail call optimized functions where the callee argument stack -  // size is bigger than the callers. 
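As a side note on the red-zone path a few lines up: a minimal standalone sketch of the StackSize shrink it performs, assuming the usual 128-byte x86-64 red zone. The helper name and parameters are illustrative and not part of this patch.

#include <algorithm>
#include <cstdint>

// Mirrors StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0) above.
inline uint64_t redZoneAdjustedStackSize(uint64_t StackSize, uint64_t CalleeSavedFrameSize,
                                         bool HasFP, uint64_t SlotSize) {
  uint64_t MinSize = CalleeSavedFrameSize + (HasFP ? SlotSize : 0);
  return std::max(MinSize, StackSize > 128 ? StackSize - 128 : uint64_t(0));
}
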
-  if (TailCallReturnAddrDelta < 0) { -    MachineInstr *MI = -      BuildMI(MBB, MBBI, DL, -              TII.get(getSUBriOpcode(Uses64BitFramePtr, -TailCallReturnAddrDelta)), -              StackPtr) -        .addReg(StackPtr) -        .addImm(-TailCallReturnAddrDelta) -        .setMIFlag(MachineInstr::FrameSetup); -    MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead. -  } - -  // Mapping for machine moves: -  // -  //   DST: VirtualFP AND -  //        SRC: VirtualFP              => DW_CFA_def_cfa_offset -  //        ELSE                        => DW_CFA_def_cfa -  // -  //   SRC: VirtualFP AND -  //        DST: Register               => DW_CFA_def_cfa_register -  // -  //   ELSE -  //        OFFSET < 0                  => DW_CFA_offset_extended_sf -  //        REG < 64                    => DW_CFA_offset + Reg -  //        ELSE                        => DW_CFA_offset_extended - -  uint64_t NumBytes = 0; -  int stackGrowth = -SlotSize; - -  if (HasFP) { -    // Calculate required stack adjustment. -    uint64_t FrameSize = StackSize - SlotSize; -    // If required, include space for extra hidden slot for stashing base pointer. -    if (X86FI->getRestoreBasePointer()) -      FrameSize += SlotSize; -    if (RegInfo->needsStackRealignment(MF)) { -      // Callee-saved registers are pushed on stack before the stack -      // is realigned. -      FrameSize -= X86FI->getCalleeSavedFrameSize(); -      NumBytes = (FrameSize + MaxAlign - 1) / MaxAlign * MaxAlign; -    } else { -      NumBytes = FrameSize - X86FI->getCalleeSavedFrameSize(); -    } - -    // Get the offset of the stack slot for the EBP register, which is -    // guaranteed to be the last slot by processFunctionBeforeFrameFinalized. -    // Update the frame offset adjustment. -    MFI->setOffsetAdjustment(-NumBytes); - -    // Save EBP/RBP into the appropriate stack slot. -    BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r)) -      .addReg(MachineFramePtr, RegState::Kill) -      .setMIFlag(MachineInstr::FrameSetup); - -    if (NeedsDwarfCFI) { -      // Mark the place where EBP/RBP was saved. -      // Define the current CFA rule to use the provided offset. -      assert(StackSize); -      unsigned CFIIndex = MMI.addFrameInst( -          MCCFIInstruction::createDefCfaOffset(nullptr, 2 * stackGrowth)); -      BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) -          .addCFIIndex(CFIIndex); - -      // Change the rule for the FramePtr to be an "offset" rule. -      unsigned DwarfFramePtr = RegInfo->getDwarfRegNum(MachineFramePtr, true); -      CFIIndex = MMI.addFrameInst( -          MCCFIInstruction::createOffset(nullptr, -                                         DwarfFramePtr, 2 * stackGrowth)); -      BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) -          .addCFIIndex(CFIIndex); -    } - -    if (NeedsWinEH) { -      BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg)) -          .addImm(FramePtr) -          .setMIFlag(MachineInstr::FrameSetup); -    } - -    // Update EBP with the new base value. -    BuildMI(MBB, MBBI, DL, -            TII.get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr), FramePtr) -        .addReg(StackPtr) -        .setMIFlag(MachineInstr::FrameSetup); - -    if (NeedsDwarfCFI) { -      // Mark effective beginning of when frame pointer becomes valid. -      // Define the current CFA to use the EBP/RBP register. 
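The realigned-frame case above rounds FrameSize up to MaxAlign with (FrameSize + MaxAlign - 1) / MaxAlign * MaxAlign. A self-contained check of that rounding; the helper name is illustrative, and Align is assumed positive as stack alignments are.

#include <cstdint>

// Round Size up to the next multiple of Align, exactly as the prologue/epilogue do above.
constexpr uint64_t alignTo(uint64_t Size, uint64_t Align) {
  return (Size + Align - 1) / Align * Align;
}

static_assert(alignTo(40, 16) == 48, "partial slot rounds up");
static_assert(alignTo(64, 16) == 64, "already aligned stays put");
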
-      unsigned DwarfFramePtr = RegInfo->getDwarfRegNum(MachineFramePtr, true); -      unsigned CFIIndex = MMI.addFrameInst( -          MCCFIInstruction::createDefCfaRegister(nullptr, DwarfFramePtr)); -      BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) -          .addCFIIndex(CFIIndex); -    } - -    // Mark the FramePtr as live-in in every block. -    for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) -      I->addLiveIn(MachineFramePtr); -  } else { -    NumBytes = StackSize - X86FI->getCalleeSavedFrameSize(); -  } - -  // Skip the callee-saved push instructions. -  bool PushedRegs = false; -  int StackOffset = 2 * stackGrowth; - -  while (MBBI != MBB.end() && -         (MBBI->getOpcode() == X86::PUSH32r || -          MBBI->getOpcode() == X86::PUSH64r)) { -    PushedRegs = true; -    unsigned Reg = MBBI->getOperand(0).getReg(); -    ++MBBI; - -    if (!HasFP && NeedsDwarfCFI) { -      // Mark callee-saved push instruction. -      // Define the current CFA rule to use the provided offset. -      assert(StackSize); -      unsigned CFIIndex = MMI.addFrameInst( -          MCCFIInstruction::createDefCfaOffset(nullptr, StackOffset)); -      BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) -          .addCFIIndex(CFIIndex); -      StackOffset += stackGrowth; -    } - -    if (NeedsWinEH) { -      BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg)).addImm(Reg).setMIFlag( -          MachineInstr::FrameSetup); -    } -  } - -  // Realign stack after we pushed callee-saved registers (so that we'll be -  // able to calculate their offsets from the frame pointer). -  if (RegInfo->needsStackRealignment(MF)) { -    assert(HasFP && "There should be a frame pointer if stack is realigned."); -    uint64_t Val = -MaxAlign; -    MachineInstr *MI = -      BuildMI(MBB, MBBI, DL, -              TII.get(getANDriOpcode(Uses64BitFramePtr, Val)), StackPtr) -      .addReg(StackPtr) -      .addImm(Val) -      .setMIFlag(MachineInstr::FrameSetup); - -    // The EFLAGS implicit def is dead. -    MI->getOperand(3).setIsDead(); -  } - -  // If there is an SUB32ri of ESP immediately before this instruction, merge -  // the two. This can be the case when tail call elimination is enabled and -  // the callee has more arguments then the caller. -  NumBytes -= mergeSPUpdates(MBB, MBBI, StackPtr, true); - -  // If there is an ADD32ri or SUB32ri of ESP immediately after this -  // instruction, merge the two instructions. -  mergeSPUpdatesDown(MBB, MBBI, StackPtr, &NumBytes); - -  // Adjust stack pointer: ESP -= numbytes. - -  // Windows and cygwin/mingw require a prologue helper routine when allocating -  // more than 4K bytes on the stack.  Windows uses __chkstk and cygwin/mingw -  // uses __alloca.  __alloca and the 32-bit version of __chkstk will probe the -  // stack and adjust the stack pointer in one go.  The 64-bit version of -  // __chkstk is only responsible for probing the stack.  The 64-bit prologue is -  // responsible for adjusting the stack pointer.  Touching the stack at 4K -  // increments is necessary to ensure that the guard pages used by the OS -  // virtual memory manager are allocated in correct sequence. -  if (NumBytes >= StackProbeSize && UseStackProbe) { -    // Check whether EAX is livein for this function. -    bool isEAXAlive = isEAXLiveIn(MF); - -    if (isEAXAlive) { -      // Sanity check that EAX is not livein for this function. -      // It should not be, so throw an assert. 
-      assert(!Is64Bit && "EAX is livein in x64 case!"); - -      // Save EAX -      BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH32r)) -        .addReg(X86::EAX, RegState::Kill) -        .setMIFlag(MachineInstr::FrameSetup); -    } - -    if (Is64Bit) { -      // Handle the 64-bit Windows ABI case where we need to call __chkstk. -      // Function prologue is responsible for adjusting the stack pointer. -      BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::RAX) -        .addImm(NumBytes) -        .setMIFlag(MachineInstr::FrameSetup); -    } else { -      // Allocate NumBytes-4 bytes on stack in case of isEAXAlive. -      // We'll also use 4 already allocated bytes for EAX. -      BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX) -        .addImm(isEAXAlive ? NumBytes - 4 : NumBytes) -        .setMIFlag(MachineInstr::FrameSetup); -    } - -    // Save a pointer to the MI where we set AX. -    MachineBasicBlock::iterator SetRAX = MBBI; -    --SetRAX; - -    // Call __chkstk, __chkstk_ms, or __alloca. -    emitStackProbeCall(MF, MBB, MBBI, DL); - -    // Apply the frame setup flag to all inserted instrs. -    for (; SetRAX != MBBI; ++SetRAX) -      SetRAX->setFlag(MachineInstr::FrameSetup); - -    if (isEAXAlive) { -      // Restore EAX -      MachineInstr *MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm), -                                              X86::EAX), -                                      StackPtr, false, NumBytes - 4); -      MI->setFlag(MachineInstr::FrameSetup); -      MBB.insert(MBBI, MI); -    } -  } else if (NumBytes) { -    emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, Uses64BitFramePtr, -                 UseLEA, TII, *RegInfo); -  } - -  int SEHFrameOffset = 0; -  if (NeedsWinEH) { -    if (HasFP) { -      // We need to set frame base offset low enough such that all saved -      // register offsets would be positive relative to it, but we can't -      // just use NumBytes, because .seh_setframe offset must be <=240. -      // So we pretend to have only allocated enough space to spill the -      // non-volatile registers. -      // We don't care about the rest of stack allocation, because unwinder -      // will restore SP to (BP - SEHFrameOffset) -      for (const CalleeSavedInfo &Info : MFI->getCalleeSavedInfo()) { -        int offset = MFI->getObjectOffset(Info.getFrameIdx()); -        SEHFrameOffset = std::max(SEHFrameOffset, std::abs(offset)); -      } -      SEHFrameOffset += SEHFrameOffset % 16; // ensure alignmant - -      // This only needs to account for XMM spill slots, GPR slots -      // are covered by the .seh_pushreg's emitted above. 
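For the Windows stack-probe path above, a rough summary of which probe routine gets called and whether the prologue still has to adjust RSP afterwards. This is a sketch with illustrative names, not the selection code itself.

#include <string>

struct StackProbe {
  std::string Symbol;
  bool CallerAdjustsSP; // 64-bit probes only touch the guard pages; the prologue emits SUB RSP, RAX.
};

// Follows the choices made in emitStackProbeCall above.
inline StackProbe pickStackProbe(bool Is64Bit, bool IsCygMing) {
  if (Is64Bit)
    return {IsCygMing ? "___chkstk_ms" : "__chkstk", true};
  return {IsCygMing ? "_alloca" : "_chkstk", false};
}
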
-      unsigned Size = SEHFrameOffset - X86FI->getCalleeSavedFrameSize(); -      if (Size) { -        BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_StackAlloc)) -            .addImm(Size) -            .setMIFlag(MachineInstr::FrameSetup); -      } - -      BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SetFrame)) -          .addImm(FramePtr) -          .addImm(SEHFrameOffset) -          .setMIFlag(MachineInstr::FrameSetup); -    } else { -      // SP will be the base register for restoring XMMs -      if (NumBytes) { -        BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_StackAlloc)) -            .addImm(NumBytes) -            .setMIFlag(MachineInstr::FrameSetup); -      } -    } -  } - -  // Skip the rest of register spilling code -  while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup)) -    ++MBBI; - -  // Emit SEH info for non-GPRs -  if (NeedsWinEH) { -    for (const CalleeSavedInfo &Info : MFI->getCalleeSavedInfo()) { -      unsigned Reg = Info.getReg(); -      if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg)) -        continue; -      assert(X86::FR64RegClass.contains(Reg) && "Unexpected register class"); - -      int Offset = getFrameIndexOffset(MF, Info.getFrameIdx()); -      Offset += SEHFrameOffset; - -      BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SaveXMM)) -          .addImm(Reg) -          .addImm(Offset) -          .setMIFlag(MachineInstr::FrameSetup); -    } - -    BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_EndPrologue)) -        .setMIFlag(MachineInstr::FrameSetup); -  } - -  // If we need a base pointer, set it up here. It's whatever the value -  // of the stack pointer is at this point. Any variable size objects -  // will be allocated after this, so we can still use the base pointer -  // to reference locals. -  if (RegInfo->hasBasePointer(MF)) { -    // Update the base pointer with the current stack pointer. -    unsigned Opc = Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr; -    BuildMI(MBB, MBBI, DL, TII.get(Opc), BasePtr) -      .addReg(StackPtr) -      .setMIFlag(MachineInstr::FrameSetup); -    if (X86FI->getRestoreBasePointer()) { -      // Stash value of base pointer.  Saving RSP instead of EBP shortens dependence chain. -      unsigned Opm = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr; -      addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)), -                   FramePtr, true, X86FI->getRestoreBasePointerOffset()) -        .addReg(StackPtr) -        .setMIFlag(MachineInstr::FrameSetup); -    } -  } - -  if (((!HasFP && NumBytes) || PushedRegs) && NeedsDwarfCFI) { -    // Mark end of stack pointer adjustment. -    if (!HasFP && NumBytes) { -      // Define the current CFA rule to use the provided offset. -      assert(StackSize); -      unsigned CFIIndex = MMI.addFrameInst( -          MCCFIInstruction::createDefCfaOffset(nullptr, -                                               -StackSize + stackGrowth)); - -      BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) -          .addCFIIndex(CFIIndex); -    } - -    // Emit DWARF info specifying the offsets of the callee-saved registers. 
-    if (PushedRegs) -      emitCalleeSavedFrameMoves(MBB, MBBI, DL); -  } -} - -void X86FrameLowering::emitEpilogue(MachineFunction &MF, -                                    MachineBasicBlock &MBB) const { -  const MachineFrameInfo *MFI = MF.getFrameInfo(); -  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); -  const X86RegisterInfo *RegInfo = -      static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo()); -  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); -  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); -  assert(MBBI != MBB.end() && "Returning block has no instructions"); -  unsigned RetOpcode = MBBI->getOpcode(); -  DebugLoc DL = MBBI->getDebugLoc(); -  const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); -  bool Is64Bit = STI.is64Bit(); -  // standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit. -  const bool Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64(); -  const bool Is64BitILP32 = STI.isTarget64BitILP32(); -  bool UseLEA = STI.useLeaForSP(); -  unsigned StackAlign = getStackAlignment(); -  unsigned SlotSize = RegInfo->getSlotSize(); -  unsigned FramePtr = RegInfo->getFrameRegister(MF); -  unsigned MachineFramePtr = Is64BitILP32 ? -             getX86SubSuperRegister(FramePtr, MVT::i64, false) : FramePtr; -  unsigned StackPtr = RegInfo->getStackRegister(); - -  bool IsWinEH = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); -  bool NeedsWinEH = IsWinEH && MF.getFunction()->needsUnwindTableEntry(); - -  switch (RetOpcode) { -  default: -    llvm_unreachable("Can only insert epilog into returning blocks"); -  case X86::RETQ: -  case X86::RETL: -  case X86::RETIL: -  case X86::RETIQ: -  case X86::TCRETURNdi: -  case X86::TCRETURNri: -  case X86::TCRETURNmi: -  case X86::TCRETURNdi64: -  case X86::TCRETURNri64: -  case X86::TCRETURNmi64: -  case X86::EH_RETURN: -  case X86::EH_RETURN64: -    break;  // These are ok -  } - -  // Get the number of bytes to allocate from the FrameInfo. -  uint64_t StackSize = MFI->getStackSize(); -  uint64_t MaxAlign  = MFI->getMaxAlignment(); -  unsigned CSSize = X86FI->getCalleeSavedFrameSize(); -  uint64_t NumBytes = 0; - -  // If we're forcing a stack realignment we can't rely on just the frame -  // info, we need to know the ABI stack alignment as well in case we -  // have a call out.  Otherwise just make sure we have some alignment - we'll -  // go with the minimum. -  if (ForceStackAlign) { -    if (MFI->hasCalls()) -      MaxAlign = (StackAlign > MaxAlign) ? StackAlign : MaxAlign; -    else -      MaxAlign = MaxAlign ? MaxAlign : 4; -  } - -  if (hasFP(MF)) { -    // Calculate required stack adjustment. -    uint64_t FrameSize = StackSize - SlotSize; -    if (RegInfo->needsStackRealignment(MF)) { -      // Callee-saved registers were pushed on stack before the stack -      // was realigned. -      FrameSize -= CSSize; -      NumBytes = (FrameSize + MaxAlign - 1) / MaxAlign * MaxAlign; -    } else { -      NumBytes = FrameSize - CSSize; -    } - -    // Pop EBP. -    BuildMI(MBB, MBBI, DL, -            TII.get(Is64Bit ? X86::POP64r : X86::POP32r), MachineFramePtr); -  } else { -    NumBytes = StackSize - CSSize; -  } - -  // Skip the callee-saved pop instructions. 
-  while (MBBI != MBB.begin()) { -    MachineBasicBlock::iterator PI = std::prev(MBBI); -    unsigned Opc = PI->getOpcode(); - -    if (Opc != X86::POP32r && Opc != X86::POP64r && Opc != X86::DBG_VALUE && -        !PI->isTerminator()) -      break; - -    --MBBI; -  } -  MachineBasicBlock::iterator FirstCSPop = MBBI; - -  DL = MBBI->getDebugLoc(); - -  // If there is an ADD32ri or SUB32ri of ESP immediately before this -  // instruction, merge the two instructions. -  if (NumBytes || MFI->hasVarSizedObjects()) -    mergeSPUpdatesUp(MBB, MBBI, StackPtr, &NumBytes); - -  // If dynamic alloca is used, then reset esp to point to the last callee-saved -  // slot before popping them off! Same applies for the case, when stack was -  // realigned. -  if (RegInfo->needsStackRealignment(MF) || MFI->hasVarSizedObjects()) { -    if (RegInfo->needsStackRealignment(MF)) -      MBBI = FirstCSPop; -    if (CSSize != 0) { -      unsigned Opc = getLEArOpcode(Uses64BitFramePtr); -      addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr), -                   FramePtr, false, -CSSize); -      --MBBI; -    } else { -      unsigned Opc = (Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr); -      BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr) -        .addReg(FramePtr); -      --MBBI; -    } -  } else if (NumBytes) { -    // Adjust stack pointer back: ESP += numbytes. -    emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, Uses64BitFramePtr, UseLEA, -                 TII, *RegInfo); -    --MBBI; -  } - -  // Windows unwinder will not invoke function's exception handler if IP is -  // either in prologue or in epilogue.  This behavior causes a problem when a -  // call immediately precedes an epilogue, because the return address points -  // into the epilogue.  To cope with that, we insert an epilogue marker here, -  // then replace it with a 'nop' if it ends up immediately after a CALL in the -  // final emitted code. -  if (NeedsWinEH) -    BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_Epilogue)); - -  // We're returning from function via eh_return. -  if (RetOpcode == X86::EH_RETURN || RetOpcode == X86::EH_RETURN64) { -    MBBI = MBB.getLastNonDebugInstr(); -    MachineOperand &DestAddr  = MBBI->getOperand(0); -    assert(DestAddr.isReg() && "Offset should be in register!"); -    BuildMI(MBB, MBBI, DL, -            TII.get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr), -            StackPtr).addReg(DestAddr.getReg()); -  } else if (RetOpcode == X86::TCRETURNri || RetOpcode == X86::TCRETURNdi || -             RetOpcode == X86::TCRETURNmi || -             RetOpcode == X86::TCRETURNri64 || RetOpcode == X86::TCRETURNdi64 || -             RetOpcode == X86::TCRETURNmi64) { -    bool isMem = RetOpcode == X86::TCRETURNmi || RetOpcode == X86::TCRETURNmi64; -    // Tail call return: adjust the stack pointer and jump to callee. -    MBBI = MBB.getLastNonDebugInstr(); -    MachineOperand &JumpTarget = MBBI->getOperand(0); -    MachineOperand &StackAdjust = MBBI->getOperand(isMem ? 5 : 1); -    assert(StackAdjust.isImm() && "Expecting immediate value."); - -    // Adjust stack pointer. -    int StackAdj = StackAdjust.getImm(); -    int MaxTCDelta = X86FI->getTCReturnAddrDelta(); -    int Offset = 0; -    assert(MaxTCDelta <= 0 && "MaxTCDelta should never be positive"); - -    // Incoporate the retaddr area. -    Offset = StackAdj-MaxTCDelta; -    assert(Offset >= 0 && "Offset should never be negative"); - -    if (Offset) { -      // Check for possible merge with preceding ADD instruction. 
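The TCRETURN handling above folds the callee-argument adjustment and the (never positive) tail-call return-address delta into one stack-pointer bump. A minimal sketch of that arithmetic, with illustrative names:

#include <cassert>
#include <cstdint>

// Pop the callee-argument area plus the RETADDR area before jumping to the callee.
inline int64_t tailCallSPOffset(int64_t StackAdj, int64_t TCReturnAddrDelta) {
  assert(TCReturnAddrDelta <= 0 && "delta should never be positive");
  int64_t Offset = StackAdj - TCReturnAddrDelta;
  assert(Offset >= 0 && "offset should never be negative");
  return Offset;
}
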
-      Offset += mergeSPUpdates(MBB, MBBI, StackPtr, true); -      emitSPUpdate(MBB, MBBI, StackPtr, Offset, Is64Bit, Uses64BitFramePtr, -                   UseLEA, TII, *RegInfo); -    } - -    // Jump to label or value in register. -    bool IsWin64 = STI.isTargetWin64(); -    if (RetOpcode == X86::TCRETURNdi || RetOpcode == X86::TCRETURNdi64) { -      unsigned Op = (RetOpcode == X86::TCRETURNdi) -                        ? X86::TAILJMPd -                        : (IsWin64 ? X86::TAILJMPd64_REX : X86::TAILJMPd64); -      MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII.get(Op)); -      if (JumpTarget.isGlobal()) -        MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(), -                             JumpTarget.getTargetFlags()); -      else { -        assert(JumpTarget.isSymbol()); -        MIB.addExternalSymbol(JumpTarget.getSymbolName(), -                              JumpTarget.getTargetFlags()); -      } -    } else if (RetOpcode == X86::TCRETURNmi || RetOpcode == X86::TCRETURNmi64) { -      unsigned Op = (RetOpcode == X86::TCRETURNmi) -                        ? X86::TAILJMPm -                        : (IsWin64 ? X86::TAILJMPm64_REX : X86::TAILJMPm64); -      MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII.get(Op)); -      for (unsigned i = 0; i != 5; ++i) -        MIB.addOperand(MBBI->getOperand(i)); -    } else if (RetOpcode == X86::TCRETURNri64) { -      BuildMI(MBB, MBBI, DL, -              TII.get(IsWin64 ? X86::TAILJMPr64_REX : X86::TAILJMPr64)) -          .addReg(JumpTarget.getReg(), RegState::Kill); -    } else { -      BuildMI(MBB, MBBI, DL, TII.get(X86::TAILJMPr)). -        addReg(JumpTarget.getReg(), RegState::Kill); -    } - -    MachineInstr *NewMI = std::prev(MBBI); -    NewMI->copyImplicitOps(MF, MBBI); - -    // Delete the pseudo instruction TCRETURN. -    MBB.erase(MBBI); -  } else if ((RetOpcode == X86::RETQ || RetOpcode == X86::RETL || -              RetOpcode == X86::RETIQ || RetOpcode == X86::RETIL) && -             (X86FI->getTCReturnAddrDelta() < 0)) { -    // Add the return addr area delta back since we are not tail calling. -    int delta = -1*X86FI->getTCReturnAddrDelta(); -    MBBI = MBB.getLastNonDebugInstr(); - -    // Check for possible merge with preceding ADD instruction. -    delta += mergeSPUpdates(MBB, MBBI, StackPtr, true); -    emitSPUpdate(MBB, MBBI, StackPtr, delta, Is64Bit, Uses64BitFramePtr, UseLEA, TII, -                 *RegInfo); -  } -} - -int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF, -                                          int FI) const { -  const X86RegisterInfo *RegInfo = -      static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo()); -  const MachineFrameInfo *MFI = MF.getFrameInfo(); -  int Offset = MFI->getObjectOffset(FI) - getOffsetOfLocalArea(); -  uint64_t StackSize = MFI->getStackSize(); - -  if (RegInfo->hasBasePointer(MF)) { -    assert (hasFP(MF) && "VLAs and dynamic stack realign, but no FP?!"); -    if (FI < 0) { -      // Skip the saved EBP. -      return Offset + RegInfo->getSlotSize(); -    } else { -      assert((-(Offset + StackSize)) % MFI->getObjectAlignment(FI) == 0); -      return Offset + StackSize; -    } -  } else if (RegInfo->needsStackRealignment(MF)) { -    if (FI < 0) { -      // Skip the saved EBP. 
-      return Offset + RegInfo->getSlotSize(); -    } else { -      assert((-(Offset + StackSize)) % MFI->getObjectAlignment(FI) == 0); -      return Offset + StackSize; -    } -    // FIXME: Support tail calls -  } else { -    if (!hasFP(MF)) -      return Offset + StackSize; - -    // Skip the saved EBP. -    Offset += RegInfo->getSlotSize(); - -    // Skip the RETADDR move area -    const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); -    int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta(); -    if (TailCallReturnAddrDelta < 0) -      Offset -= TailCallReturnAddrDelta; -  } - -  return Offset; -} - -int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, -                                             unsigned &FrameReg) const { -  const X86RegisterInfo *RegInfo = -      static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo()); -  // We can't calculate offset from frame pointer if the stack is realigned, -  // so enforce usage of stack/base pointer.  The base pointer is used when we -  // have dynamic allocas in addition to dynamic realignment. -  if (RegInfo->hasBasePointer(MF)) -    FrameReg = RegInfo->getBaseRegister(); -  else if (RegInfo->needsStackRealignment(MF)) -    FrameReg = RegInfo->getStackRegister(); -  else -    FrameReg = RegInfo->getFrameRegister(MF); -  return getFrameIndexOffset(MF, FI); -} - -// Simplified from getFrameIndexOffset keeping only StackPointer cases -int X86FrameLowering::getFrameIndexOffsetFromSP(const MachineFunction &MF, int FI) const { -  const MachineFrameInfo *MFI = MF.getFrameInfo(); -  // Does not include any dynamic realign. -  const uint64_t StackSize = MFI->getStackSize(); -  { -#ifndef NDEBUG -    const X86RegisterInfo *RegInfo = -      static_cast<const X86RegisterInfo*>(MF.getSubtarget().getRegisterInfo()); -    // Note: LLVM arranges the stack as: -    // Args > Saved RetPC (<--FP) > CSRs > dynamic alignment (<--BP) -    //      > "Stack Slots" (<--SP) -    // We can always address StackSlots from RSP.  We can usually (unless -    // needsStackRealignment) address CSRs from RSP, but sometimes need to -    // address them from RBP.  FixedObjects can be placed anywhere in the stack -    // frame depending on their specific requirements (i.e. we can actually -    // refer to arguments to the function which are stored in the *callers* -    // frame).  As a result, THE RESULT OF THIS CALL IS MEANINGLESS FOR CSRs -    // AND FixedObjects IFF needsStackRealignment or hasVarSizedObject. - -    assert(!RegInfo->hasBasePointer(MF) && "we don't handle this case"); - -    // We don't handle tail calls, and shouldn't be seeing them -    // either. -    int TailCallReturnAddrDelta = -        MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta(); -    assert(!(TailCallReturnAddrDelta < 0) && "we don't handle this case!"); -#endif -  } - -  // This is how the math works out: -  // -  //  %rsp grows (i.e. gets lower) left to right. Each box below is -  //  one word (eight bytes).  Obj0 is the stack slot we're trying to -  //  get to. -  // -  //    ---------------------------------- -  //    | BP | Obj0 | Obj1 | ... | ObjN | -  //    ---------------------------------- -  //    ^    ^      ^                   ^ -  //    A    B      C                   E -  // -  // A is the incoming stack pointer. 
-  // (B - A) is the local area offset (-8 for x86-64) [1] -  // (C - A) is the Offset returned by MFI->getObjectOffset for Obj0 [2] -  // -  // |(E - B)| is the StackSize (absolute value, positive).  For a -  // stack that grown down, this works out to be (B - E). [3] -  // -  // E is also the value of %rsp after stack has been set up, and we -  // want (C - E) -- the value we can add to %rsp to get to Obj0.  Now -  // (C - E) == (C - A) - (B - A) + (B - E) -  //            { Using [1], [2] and [3] above } -  //         == getObjectOffset - LocalAreaOffset + StackSize -  // - -  // Get the Offset from the StackPointer -  int Offset = MFI->getObjectOffset(FI) - getOffsetOfLocalArea(); - -  return Offset + StackSize; -} -// Simplified from getFrameIndexReference keeping only StackPointer cases -int X86FrameLowering::getFrameIndexReferenceFromSP(const MachineFunction &MF, int FI, -                                                  unsigned &FrameReg) const { -  const X86RegisterInfo *RegInfo = -    static_cast<const X86RegisterInfo*>(MF.getSubtarget().getRegisterInfo()); - -  assert(!RegInfo->hasBasePointer(MF) && "we don't handle this case"); - -  FrameReg = RegInfo->getStackRegister(); -  return getFrameIndexOffsetFromSP(MF, FI); -} - -bool X86FrameLowering::assignCalleeSavedSpillSlots( -    MachineFunction &MF, const TargetRegisterInfo *TRI, -    std::vector<CalleeSavedInfo> &CSI) const { -  MachineFrameInfo *MFI = MF.getFrameInfo(); -  const X86RegisterInfo *RegInfo = -      static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo()); -  unsigned SlotSize = RegInfo->getSlotSize(); -  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); - -  unsigned CalleeSavedFrameSize = 0; -  int SpillSlotOffset = getOffsetOfLocalArea() + X86FI->getTCReturnAddrDelta(); - -  if (hasFP(MF)) { -    // emitPrologue always spills frame register the first thing. -    SpillSlotOffset -= SlotSize; -    MFI->CreateFixedSpillStackObject(SlotSize, SpillSlotOffset); - -    // Since emitPrologue and emitEpilogue will handle spilling and restoring of -    // the frame register, we can delete it from CSI list and not have to worry -    // about avoiding it later. -    unsigned FPReg = RegInfo->getFrameRegister(MF); -    for (unsigned i = 0; i < CSI.size(); ++i) { -      if (TRI->regsOverlap(CSI[i].getReg(),FPReg)) { -        CSI.erase(CSI.begin() + i); -        break; -      } -    } -  } - -  // Assign slots for GPRs. It increases frame size. -  for (unsigned i = CSI.size(); i != 0; --i) { -    unsigned Reg = CSI[i - 1].getReg(); - -    if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg)) -      continue; - -    SpillSlotOffset -= SlotSize; -    CalleeSavedFrameSize += SlotSize; - -    int SlotIndex = MFI->CreateFixedSpillStackObject(SlotSize, SpillSlotOffset); -    CSI[i - 1].setFrameIdx(SlotIndex); -  } - -  X86FI->setCalleeSavedFrameSize(CalleeSavedFrameSize); - -  // Assign slots for XMMs. 
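The (C - E) derivation above reduces to a single expression. A worked check with made-up numbers (ObjectOffset = -24, LocalAreaOffset = -8 as on x86-64, StackSize = 40); names are illustrative.

#include <cstdint>

// getObjectOffset - LocalAreaOffset + StackSize, per the derivation above.
constexpr int64_t offsetFromSP(int64_t ObjectOffset, int64_t LocalAreaOffset,
                               int64_t StackSize) {
  return ObjectOffset - LocalAreaOffset + StackSize;
}

static_assert(offsetFromSP(-24, -8, 40) == 24,
              "Obj0 would sit 24 bytes above the post-prologue RSP");
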
-  for (unsigned i = CSI.size(); i != 0; --i) { -    unsigned Reg = CSI[i - 1].getReg(); -    if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg)) -      continue; - -    const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg); -    // ensure alignment -    SpillSlotOffset -= std::abs(SpillSlotOffset) % RC->getAlignment(); -    // spill into slot -    SpillSlotOffset -= RC->getSize(); -    int SlotIndex = -        MFI->CreateFixedSpillStackObject(RC->getSize(), SpillSlotOffset); -    CSI[i - 1].setFrameIdx(SlotIndex); -    MFI->ensureMaxAlignment(RC->getAlignment()); -  } - -  return true; -} - -bool X86FrameLowering::spillCalleeSavedRegisters( -    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, -    const std::vector<CalleeSavedInfo> &CSI, -    const TargetRegisterInfo *TRI) const { -  DebugLoc DL = MBB.findDebugLoc(MI); - -  MachineFunction &MF = *MBB.getParent(); -  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); -  const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); - -  // Push GPRs. It increases frame size. -  unsigned Opc = STI.is64Bit() ? X86::PUSH64r : X86::PUSH32r; -  for (unsigned i = CSI.size(); i != 0; --i) { -    unsigned Reg = CSI[i - 1].getReg(); - -    if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg)) -      continue; -    // Add the callee-saved register as live-in. It's killed at the spill. -    MBB.addLiveIn(Reg); - -    BuildMI(MBB, MI, DL, TII.get(Opc)).addReg(Reg, RegState::Kill) -      .setMIFlag(MachineInstr::FrameSetup); -  } - -  // Make XMM regs spilled. X86 does not have ability of push/pop XMM. -  // It can be done by spilling XMMs to stack frame. -  for (unsigned i = CSI.size(); i != 0; --i) { -    unsigned Reg = CSI[i-1].getReg(); -    if (X86::GR64RegClass.contains(Reg) || -        X86::GR32RegClass.contains(Reg)) -      continue; -    // Add the callee-saved register as live-in. It's killed at the spill. -    MBB.addLiveIn(Reg); -    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - -    TII.storeRegToStackSlot(MBB, MI, Reg, true, CSI[i - 1].getFrameIdx(), RC, -                            TRI); -    --MI; -    MI->setFlag(MachineInstr::FrameSetup); -    ++MI; -  } - -  return true; -} - -bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, -                                               MachineBasicBlock::iterator MI, -                                        const std::vector<CalleeSavedInfo> &CSI, -                                          const TargetRegisterInfo *TRI) const { -  if (CSI.empty()) -    return false; - -  DebugLoc DL = MBB.findDebugLoc(MI); - -  MachineFunction &MF = *MBB.getParent(); -  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); -  const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); - -  // Reload XMMs from stack frame. -  for (unsigned i = 0, e = CSI.size(); i != e; ++i) { -    unsigned Reg = CSI[i].getReg(); -    if (X86::GR64RegClass.contains(Reg) || -        X86::GR32RegClass.contains(Reg)) -      continue; - -    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); -    TII.loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(), RC, TRI); -  } - -  // POP GPRs. -  unsigned Opc = STI.is64Bit() ? 
X86::POP64r : X86::POP32r; -  for (unsigned i = 0, e = CSI.size(); i != e; ++i) { -    unsigned Reg = CSI[i].getReg(); -    if (!X86::GR64RegClass.contains(Reg) && -        !X86::GR32RegClass.contains(Reg)) -      continue; - -    BuildMI(MBB, MI, DL, TII.get(Opc), Reg); -  } -  return true; -} - -void -X86FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, -                                                       RegScavenger *RS) const { -  MachineFrameInfo *MFI = MF.getFrameInfo(); -  const X86RegisterInfo *RegInfo = -      static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo()); -  unsigned SlotSize = RegInfo->getSlotSize(); - -  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); -  int64_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta(); - -  if (TailCallReturnAddrDelta < 0) { -    // create RETURNADDR area -    //   arg -    //   arg -    //   RETADDR -    //   { ... -    //     RETADDR area -    //     ... -    //   } -    //   [EBP] -    MFI->CreateFixedObject(-TailCallReturnAddrDelta, -                           TailCallReturnAddrDelta - SlotSize, true); -  } - -  // Spill the BasePtr if it's used. -  if (RegInfo->hasBasePointer(MF)) -    MF.getRegInfo().setPhysRegUsed(RegInfo->getBaseRegister()); -} - -static bool -HasNestArgument(const MachineFunction *MF) { -  const Function *F = MF->getFunction(); -  for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); -       I != E; I++) { -    if (I->hasNestAttr()) -      return true; -  } -  return false; -} - -/// GetScratchRegister - Get a temp register for performing work in the -/// segmented stack and the Erlang/HiPE stack prologue. Depending on platform -/// and the properties of the function either one or two registers will be -/// needed. Set primary to true for the first register, false for the second. -static unsigned -GetScratchRegister(bool Is64Bit, bool IsLP64, const MachineFunction &MF, bool Primary) { -  CallingConv::ID CallingConvention = MF.getFunction()->getCallingConv(); - -  // Erlang stuff. -  if (CallingConvention == CallingConv::HiPE) { -    if (Is64Bit) -      return Primary ? X86::R14 : X86::R13; -    else -      return Primary ? X86::EBX : X86::EDI; -  } - -  if (Is64Bit) { -    if (IsLP64) -      return Primary ? X86::R11 : X86::R12; -    else -      return Primary ? X86::R11D : X86::R12D; -  } - -  bool IsNested = HasNestArgument(&MF); - -  if (CallingConvention == CallingConv::X86_FastCall || -      CallingConvention == CallingConv::Fast) { -    if (IsNested) -      report_fatal_error("Segmented stacks does not support fastcall with " -                         "nested function."); -    return Primary ? X86::EAX : X86::ECX; -  } -  if (IsNested) -    return Primary ? X86::EDX : X86::EAX; -  return Primary ? X86::ECX : X86::EAX; -} - -// The stack limit in the TCB is set to this many bytes above the actual stack -// limit. 
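As a rough standalone model of the stacklet check emitted into checkMBB below (the 256-byte slack is the kSplitStackAvailable constant declared next): register choice and TLS addressing are omitted, and the names here are illustrative.

#include <cstdint>

// The TCB limit is biased upward by the slack, so small frames may compare the stack
// pointer itself; larger frames compare SP minus the frame size. The JA branch above
// skips __morestack exactly when this predicate is false.
inline bool needsMoreStack(uint64_t SP, uint64_t BiasedLimit, uint64_t StackSize) {
  const uint64_t kSplitStackAvailable = 256;
  if (StackSize < kSplitStackAvailable)
    return SP <= BiasedLimit;
  return SP - StackSize <= BiasedLimit;
}
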
-static const uint64_t kSplitStackAvailable = 256; - -void -X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { -  MachineBasicBlock &prologueMBB = MF.front(); -  MachineFrameInfo *MFI = MF.getFrameInfo(); -  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); -  uint64_t StackSize; -  const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); -  bool Is64Bit = STI.is64Bit(); -  const bool IsLP64 = STI.isTarget64BitLP64(); -  unsigned TlsReg, TlsOffset; -  DebugLoc DL; - -  unsigned ScratchReg = GetScratchRegister(Is64Bit, IsLP64, MF, true); -  assert(!MF.getRegInfo().isLiveIn(ScratchReg) && -         "Scratch register is live-in"); - -  if (MF.getFunction()->isVarArg()) -    report_fatal_error("Segmented stacks do not support vararg functions."); -  if (!STI.isTargetLinux() && !STI.isTargetDarwin() && !STI.isTargetWin32() && -      !STI.isTargetWin64() && !STI.isTargetFreeBSD() && -      !STI.isTargetDragonFly()) -    report_fatal_error("Segmented stacks not supported on this platform."); - -  // Eventually StackSize will be calculated by a link-time pass; which will -  // also decide whether checking code needs to be injected into this particular -  // prologue. -  StackSize = MFI->getStackSize(); - -  // Do not generate a prologue for functions with a stack of size zero -  if (StackSize == 0) -    return; - -  MachineBasicBlock *allocMBB = MF.CreateMachineBasicBlock(); -  MachineBasicBlock *checkMBB = MF.CreateMachineBasicBlock(); -  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); -  bool IsNested = false; - -  // We need to know if the function has a nest argument only in 64 bit mode. -  if (Is64Bit) -    IsNested = HasNestArgument(&MF); - -  // The MOV R10, RAX needs to be in a different block, since the RET we emit in -  // allocMBB needs to be last (terminating) instruction. - -  for (MachineBasicBlock::livein_iterator i = prologueMBB.livein_begin(), -         e = prologueMBB.livein_end(); i != e; i++) { -    allocMBB->addLiveIn(*i); -    checkMBB->addLiveIn(*i); -  } - -  if (IsNested) -    allocMBB->addLiveIn(IsLP64 ? X86::R10 : X86::R10D); - -  MF.push_front(allocMBB); -  MF.push_front(checkMBB); - -  // When the frame size is less than 256 we just compare the stack -  // boundary directly to the value of the stack pointer, per gcc. -  bool CompareStackPointer = StackSize < kSplitStackAvailable; - -  // Read the limit off the current stacklet off the stack_guard location. -  if (Is64Bit) { -    if (STI.isTargetLinux()) { -      TlsReg = X86::FS; -      TlsOffset = IsLP64 ? 0x70 : 0x40; -    } else if (STI.isTargetDarwin()) { -      TlsReg = X86::GS; -      TlsOffset = 0x60 + 90*8; // See pthread_machdep.h. Steal TLS slot 90. -    } else if (STI.isTargetWin64()) { -      TlsReg = X86::GS; -      TlsOffset = 0x28; // pvArbitrary, reserved for application use -    } else if (STI.isTargetFreeBSD()) { -      TlsReg = X86::FS; -      TlsOffset = 0x18; -    } else if (STI.isTargetDragonFly()) { -      TlsReg = X86::FS; -      TlsOffset = 0x20; // use tls_tcb.tcb_segstack -    } else { -      report_fatal_error("Segmented stacks not supported on this platform."); -    } - -    if (CompareStackPointer) -      ScratchReg = IsLP64 ? X86::RSP : X86::ESP; -    else -      BuildMI(checkMBB, DL, TII.get(IsLP64 ? X86::LEA64r : X86::LEA64_32r), ScratchReg).addReg(X86::RSP) -        .addImm(1).addReg(0).addImm(-StackSize).addReg(0); - -    BuildMI(checkMBB, DL, TII.get(IsLP64 ? 
X86::CMP64rm : X86::CMP32rm)).addReg(ScratchReg) -      .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg); -  } else { -    if (STI.isTargetLinux()) { -      TlsReg = X86::GS; -      TlsOffset = 0x30; -    } else if (STI.isTargetDarwin()) { -      TlsReg = X86::GS; -      TlsOffset = 0x48 + 90*4; -    } else if (STI.isTargetWin32()) { -      TlsReg = X86::FS; -      TlsOffset = 0x14; // pvArbitrary, reserved for application use -    } else if (STI.isTargetDragonFly()) { -      TlsReg = X86::FS; -      TlsOffset = 0x10; // use tls_tcb.tcb_segstack -    } else if (STI.isTargetFreeBSD()) { -      report_fatal_error("Segmented stacks not supported on FreeBSD i386."); -    } else { -      report_fatal_error("Segmented stacks not supported on this platform."); -    } - -    if (CompareStackPointer) -      ScratchReg = X86::ESP; -    else -      BuildMI(checkMBB, DL, TII.get(X86::LEA32r), ScratchReg).addReg(X86::ESP) -        .addImm(1).addReg(0).addImm(-StackSize).addReg(0); - -    if (STI.isTargetLinux() || STI.isTargetWin32() || STI.isTargetWin64() || -        STI.isTargetDragonFly()) { -      BuildMI(checkMBB, DL, TII.get(X86::CMP32rm)).addReg(ScratchReg) -        .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg); -    } else if (STI.isTargetDarwin()) { - -      // TlsOffset doesn't fit into a mod r/m byte so we need an extra register. -      unsigned ScratchReg2; -      bool SaveScratch2; -      if (CompareStackPointer) { -        // The primary scratch register is available for holding the TLS offset. -        ScratchReg2 = GetScratchRegister(Is64Bit, IsLP64, MF, true); -        SaveScratch2 = false; -      } else { -        // Need to use a second register to hold the TLS offset -        ScratchReg2 = GetScratchRegister(Is64Bit, IsLP64, MF, false); - -        // Unfortunately, with fastcc the second scratch register may hold an -        // argument. -        SaveScratch2 = MF.getRegInfo().isLiveIn(ScratchReg2); -      } - -      // If Scratch2 is live-in then it needs to be saved. -      assert((!MF.getRegInfo().isLiveIn(ScratchReg2) || SaveScratch2) && -             "Scratch register is live-in and not saved"); - -      if (SaveScratch2) -        BuildMI(checkMBB, DL, TII.get(X86::PUSH32r)) -          .addReg(ScratchReg2, RegState::Kill); - -      BuildMI(checkMBB, DL, TII.get(X86::MOV32ri), ScratchReg2) -        .addImm(TlsOffset); -      BuildMI(checkMBB, DL, TII.get(X86::CMP32rm)) -        .addReg(ScratchReg) -        .addReg(ScratchReg2).addImm(1).addReg(0) -        .addImm(0) -        .addReg(TlsReg); - -      if (SaveScratch2) -        BuildMI(checkMBB, DL, TII.get(X86::POP32r), ScratchReg2); -    } -  } - -  // This jump is taken if SP >= (Stacklet Limit + Stack Space required). -  // It jumps to normal execution of the function body. -  BuildMI(checkMBB, DL, TII.get(X86::JA_1)).addMBB(&prologueMBB); - -  // On 32 bit we first push the arguments size and then the frame size. On 64 -  // bit, we pass the stack frame size in r10 and the argument size in r11. -  if (Is64Bit) { -    // Functions with nested arguments use R10, so it needs to be saved across -    // the call to _morestack - -    const unsigned RegAX = IsLP64 ? X86::RAX : X86::EAX; -    const unsigned Reg10 = IsLP64 ? X86::R10 : X86::R10D; -    const unsigned Reg11 = IsLP64 ? X86::R11 : X86::R11D; -    const unsigned MOVrr = IsLP64 ? X86::MOV64rr : X86::MOV32rr; -    const unsigned MOVri = IsLP64 ? 
X86::MOV64ri : X86::MOV32ri; - -    if (IsNested) -      BuildMI(allocMBB, DL, TII.get(MOVrr), RegAX).addReg(Reg10); - -    BuildMI(allocMBB, DL, TII.get(MOVri), Reg10) -      .addImm(StackSize); -    BuildMI(allocMBB, DL, TII.get(MOVri), Reg11) -      .addImm(X86FI->getArgumentStackSize()); -    MF.getRegInfo().setPhysRegUsed(Reg10); -    MF.getRegInfo().setPhysRegUsed(Reg11); -  } else { -    BuildMI(allocMBB, DL, TII.get(X86::PUSHi32)) -      .addImm(X86FI->getArgumentStackSize()); -    BuildMI(allocMBB, DL, TII.get(X86::PUSHi32)) -      .addImm(StackSize); -  } - -  // __morestack is in libgcc -  if (Is64Bit && MF.getTarget().getCodeModel() == CodeModel::Large) { -    // Under the large code model, we cannot assume that __morestack lives -    // within 2^31 bytes of the call site, so we cannot use pc-relative -    // addressing. We cannot perform the call via a temporary register, -    // as the rax register may be used to store the static chain, and all -    // other suitable registers may be either callee-save or used for -    // parameter passing. We cannot use the stack at this point either -    // because __morestack manipulates the stack directly. -    // -    // To avoid these issues, perform an indirect call via a read-only memory -    // location containing the address. -    // -    // This solution is not perfect, as it assumes that the .rodata section -    // is laid out within 2^31 bytes of each function body, but this seems -    // to be sufficient for JIT. -    BuildMI(allocMBB, DL, TII.get(X86::CALL64m)) -        .addReg(X86::RIP) -        .addImm(0) -        .addReg(0) -        .addExternalSymbol("__morestack_addr") -        .addReg(0); -    MF.getMMI().setUsesMorestackAddr(true); -  } else { -    if (Is64Bit) -      BuildMI(allocMBB, DL, TII.get(X86::CALL64pcrel32)) -        .addExternalSymbol("__morestack"); -    else -      BuildMI(allocMBB, DL, TII.get(X86::CALLpcrel32)) -        .addExternalSymbol("__morestack"); -  } - -  if (IsNested) -    BuildMI(allocMBB, DL, TII.get(X86::MORESTACK_RET_RESTORE_R10)); -  else -    BuildMI(allocMBB, DL, TII.get(X86::MORESTACK_RET)); - -  allocMBB->addSuccessor(&prologueMBB); - -  checkMBB->addSuccessor(allocMBB); -  checkMBB->addSuccessor(&prologueMBB); - -#ifdef XDEBUG -  MF.verify(); -#endif -} - -/// Erlang programs may need a special prologue to handle the stack size they -/// might need at runtime. That is because Erlang/OTP does not implement a C -/// stack but uses a custom implementation of hybrid stack/heap architecture. -/// (for more information see Eric Stenman's Ph.D. thesis: -/// http://publications.uu.se/uu/fulltext/nbn_se_uu_diva-2688.pdf) -/// -/// CheckStack: -///       temp0 = sp - MaxStack -///       if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart -/// OldStart: -///       ... 
-/// IncStack: -///       call inc_stack   # doubles the stack space -///       temp0 = sp - MaxStack -///       if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart -void X86FrameLowering::adjustForHiPEPrologue(MachineFunction &MF) const { -  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); -  MachineFrameInfo *MFI = MF.getFrameInfo(); -  const unsigned SlotSize = -      static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo()) -          ->getSlotSize(); -  const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); -  const bool Is64Bit = STI.is64Bit(); -  const bool IsLP64 = STI.isTarget64BitLP64(); -  DebugLoc DL; -  // HiPE-specific values -  const unsigned HipeLeafWords = 24; -  const unsigned CCRegisteredArgs = Is64Bit ? 6 : 5; -  const unsigned Guaranteed = HipeLeafWords * SlotSize; -  unsigned CallerStkArity = MF.getFunction()->arg_size() > CCRegisteredArgs ? -                            MF.getFunction()->arg_size() - CCRegisteredArgs : 0; -  unsigned MaxStack = MFI->getStackSize() + CallerStkArity*SlotSize + SlotSize; - -  assert(STI.isTargetLinux() && -         "HiPE prologue is only supported on Linux operating systems."); - -  // Compute the largest caller's frame that is needed to fit the callees' -  // frames. This 'MaxStack' is computed from: -  // -  // a) the fixed frame size, which is the space needed for all spilled temps, -  // b) outgoing on-stack parameter areas, and -  // c) the minimum stack space this function needs to make available for the -  //    functions it calls (a tunable ABI property). -  if (MFI->hasCalls()) { -    unsigned MoreStackForCalls = 0; - -    for (MachineFunction::iterator MBBI = MF.begin(), MBBE = MF.end(); -         MBBI != MBBE; ++MBBI) -      for (MachineBasicBlock::iterator MI = MBBI->begin(), ME = MBBI->end(); -           MI != ME; ++MI) { -        if (!MI->isCall()) -          continue; - -        // Get callee operand. -        const MachineOperand &MO = MI->getOperand(0); - -        // Only take account of global function calls (no closures etc.). -        if (!MO.isGlobal()) -          continue; - -        const Function *F = dyn_cast<Function>(MO.getGlobal()); -        if (!F) -          continue; - -        // Do not update 'MaxStack' for primitive and built-in functions -        // (encoded with names either starting with "erlang."/"bif_" or not -        // having a ".", such as a simple <Module>.<Function>.<Arity>, or an -        // "_", such as the BIF "suspend_0") as they are executed on another -        // stack. -        if (F->getName().find("erlang.") != StringRef::npos || -            F->getName().find("bif_") != StringRef::npos || -            F->getName().find_first_of("._") == StringRef::npos) -          continue; - -        unsigned CalleeStkArity = -          F->arg_size() > CCRegisteredArgs ? F->arg_size()-CCRegisteredArgs : 0; -        if (HipeLeafWords - 1 > CalleeStkArity) -          MoreStackForCalls = std::max(MoreStackForCalls, -                               (HipeLeafWords - 1 - CalleeStkArity) * SlotSize); -      } -    MaxStack += MoreStackForCalls; -  } - -  // If the stack frame needed is larger than the guaranteed then runtime checks -  // and calls to "inc_stack_0" BIF should be inserted in the assembly prologue. 
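As a rough arithmetic model of that comparison, using the x86-64 constants above (HipeLeafWords = 24, six register-passed arguments, 8-byte slots): the function and parameter names are illustrative, and the per-call maximum is folded into a single "smallest callee stack arity" parameter.

#include <cstdint>

// Sketch of the MaxStack computation above; runtime checks are needed when the
// result exceeds the guaranteed HipeLeafWords * SlotSize bytes.
inline uint64_t hipeMaxStack(uint64_t FrameSize, unsigned CallerArity,
                             unsigned MinCalleeArity, bool HasCalls) {
  const unsigned SlotSize = 8, HipeLeafWords = 24, CCRegisteredArgs = 6;
  auto StkArity = [&](unsigned Arity) -> unsigned {
    return Arity > CCRegisteredArgs ? Arity - CCRegisteredArgs : 0;
  };
  uint64_t MaxStack = FrameSize + StkArity(CallerArity) * SlotSize + SlotSize;
  if (HasCalls && HipeLeafWords - 1 > StkArity(MinCalleeArity))
    MaxStack += (HipeLeafWords - 1 - StkArity(MinCalleeArity)) * SlotSize;
  return MaxStack;
}
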
-  if (MaxStack > Guaranteed) { -    MachineBasicBlock &prologueMBB = MF.front(); -    MachineBasicBlock *stackCheckMBB = MF.CreateMachineBasicBlock(); -    MachineBasicBlock *incStackMBB = MF.CreateMachineBasicBlock(); - -    for (MachineBasicBlock::livein_iterator I = prologueMBB.livein_begin(), -           E = prologueMBB.livein_end(); I != E; I++) { -      stackCheckMBB->addLiveIn(*I); -      incStackMBB->addLiveIn(*I); -    } - -    MF.push_front(incStackMBB); -    MF.push_front(stackCheckMBB); - -    unsigned ScratchReg, SPReg, PReg, SPLimitOffset; -    unsigned LEAop, CMPop, CALLop; -    if (Is64Bit) { -      SPReg = X86::RSP; -      PReg  = X86::RBP; -      LEAop = X86::LEA64r; -      CMPop = X86::CMP64rm; -      CALLop = X86::CALL64pcrel32; -      SPLimitOffset = 0x90; -    } else { -      SPReg = X86::ESP; -      PReg  = X86::EBP; -      LEAop = X86::LEA32r; -      CMPop = X86::CMP32rm; -      CALLop = X86::CALLpcrel32; -      SPLimitOffset = 0x4c; -    } - -    ScratchReg = GetScratchRegister(Is64Bit, IsLP64, MF, true); -    assert(!MF.getRegInfo().isLiveIn(ScratchReg) && -           "HiPE prologue scratch register is live-in"); - -    // Create new MBB for StackCheck: -    addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(LEAop), ScratchReg), -                 SPReg, false, -MaxStack); -    // SPLimitOffset is in a fixed heap location (pointed by BP). -    addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(CMPop)) -                 .addReg(ScratchReg), PReg, false, SPLimitOffset); -    BuildMI(stackCheckMBB, DL, TII.get(X86::JAE_1)).addMBB(&prologueMBB); - -    // Create new MBB for IncStack: -    BuildMI(incStackMBB, DL, TII.get(CALLop)). -      addExternalSymbol("inc_stack_0"); -    addRegOffset(BuildMI(incStackMBB, DL, TII.get(LEAop), ScratchReg), -                 SPReg, false, -MaxStack); -    addRegOffset(BuildMI(incStackMBB, DL, TII.get(CMPop)) -                 .addReg(ScratchReg), PReg, false, SPLimitOffset); -    BuildMI(incStackMBB, DL, TII.get(X86::JLE_1)).addMBB(incStackMBB); - -    stackCheckMBB->addSuccessor(&prologueMBB, 99); -    stackCheckMBB->addSuccessor(incStackMBB, 1); -    incStackMBB->addSuccessor(&prologueMBB, 99); -    incStackMBB->addSuccessor(incStackMBB, 1); -  } -#ifdef XDEBUG -  MF.verify(); -#endif -} - -bool X86FrameLowering:: -convertArgMovsToPushes(MachineFunction &MF, MachineBasicBlock &MBB, -                       MachineBasicBlock::iterator I, uint64_t Amount) const { -  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); -  const X86RegisterInfo &RegInfo = *static_cast<const X86RegisterInfo *>( -    MF.getSubtarget().getRegisterInfo()); -  unsigned StackPtr = RegInfo.getStackRegister(); - -  // Scan the call setup sequence for the pattern we're looking for. -  // We only handle a simple case now - a sequence of MOV32mi or MOV32mr -  // instructions, that push a sequence of 32-bit values onto the stack, with -  // no gaps.   -  std::map<int64_t, MachineBasicBlock::iterator> MovMap; -  do { -    int Opcode = I->getOpcode(); -    if (Opcode != X86::MOV32mi && Opcode != X86::MOV32mr) -      break; -  -    // We only want movs of the form: -    // movl imm/r32, k(%ecx) -    // If we run into something else, bail -    // Note that AddrBaseReg may, counterintuitively, not be a register... 
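The operand checks that follow feed a displacement-to-instruction map; the essential invariant is that the 32-bit stores cover slots 0, 4, 8, ... with no gaps or duplicates. A standalone sketch of that invariant (container and names illustrative):

#include <cstdint>
#include <map>

// std::map iterates in ascending key order, so the original insertion order of the
// MOVs does not matter, exactly as the pass notes below.
inline bool formsContiguousPushSequence(const std::map<int64_t, unsigned> &MovMap) {
  int64_t ExpectedDist = 0;
  for (const auto &Entry : MovMap) {
    if (Entry.first % 4 != 0 || Entry.first != ExpectedDist)
      return false;
    ExpectedDist += 4;
  }
  return true;
}
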
-    if (!I->getOperand(X86::AddrBaseReg).isReg() ||  -        (I->getOperand(X86::AddrBaseReg).getReg() != StackPtr) || -        !I->getOperand(X86::AddrScaleAmt).isImm() || -        (I->getOperand(X86::AddrScaleAmt).getImm() != 1) || -        (I->getOperand(X86::AddrIndexReg).getReg() != X86::NoRegister) || -        (I->getOperand(X86::AddrSegmentReg).getReg() != X86::NoRegister) || -        !I->getOperand(X86::AddrDisp).isImm()) -      return false; - -    int64_t StackDisp = I->getOperand(X86::AddrDisp).getImm(); -     -    // We don't want to consider the unaligned case. -    if (StackDisp % 4) -      return false; - -    // If the same stack slot is being filled twice, something's fishy. -    if (!MovMap.insert(std::pair<int64_t, MachineInstr*>(StackDisp, I)).second) -      return false; - -    ++I; -  } while (I != MBB.end()); - -  // We now expect the end of the sequence - a call and a stack adjust. -  if (I == MBB.end()) -    return false; -  if (!I->isCall()) -    return false; -  MachineBasicBlock::iterator Call = I; -  if ((++I)->getOpcode() != TII.getCallFrameDestroyOpcode()) -    return false; - -  // Now, go through the map, and see that we don't have any gaps, -  // but only a series of 32-bit MOVs. -  // Since std::map provides ordered iteration, the original order -  // of the MOVs doesn't matter. -  int64_t ExpectedDist = 0; -  for (auto MMI = MovMap.begin(), MME = MovMap.end(); MMI != MME;  -       ++MMI, ExpectedDist += 4) -    if (MMI->first != ExpectedDist) -      return false; - -  // Ok, everything looks fine. Do the transformation. -  DebugLoc DL = I->getDebugLoc(); - -  // It's possible the original stack adjustment amount was larger than -  // that done by the pushes. If so, we still need a SUB. -  Amount -= ExpectedDist; -  if (Amount) { -    MachineInstr* Sub = BuildMI(MBB, Call, DL, -                          TII.get(getSUBriOpcode(false, Amount)), StackPtr) -                  .addReg(StackPtr).addImm(Amount); -    Sub->getOperand(3).setIsDead(); -  } - -  // Now, iterate through the map in reverse order, and replace the movs -  // with pushes. MOVmi/MOVmr doesn't have any defs, so need to replace uses. -  for (auto MMI = MovMap.rbegin(), MME = MovMap.rend(); MMI != MME; ++MMI) { -    MachineBasicBlock::iterator MOV = MMI->second; -    MachineOperand PushOp = MOV->getOperand(X86::AddrNumOperands); - -    // Replace MOVmr with PUSH32r, and MOVmi with PUSHi of appropriate size -    int PushOpcode = X86::PUSH32r; -    if (MOV->getOpcode() == X86::MOV32mi) -      PushOpcode = getPUSHiOpcode(false, PushOp); - -    BuildMI(MBB, Call, DL, TII.get(PushOpcode)).addOperand(PushOp); -    MBB.erase(MOV); -  } - -  return true; -} - -void X86FrameLowering:: -eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, -                              MachineBasicBlock::iterator I) const { -  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); -  const X86RegisterInfo &RegInfo = *static_cast<const X86RegisterInfo *>( -                                       MF.getSubtarget().getRegisterInfo()); -  unsigned StackPtr = RegInfo.getStackRegister(); -  bool reserveCallFrame = hasReservedCallFrame(MF); -  int Opcode = I->getOpcode(); -  bool isDestroy = Opcode == TII.getCallFrameDestroyOpcode(); -  const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); -  bool IsLP64 = STI.isTarget64BitLP64(); -  DebugLoc DL = I->getDebugLoc(); -  uint64_t Amount = !reserveCallFrame ? I->getOperand(0).getImm() : 0; -  uint64_t CalleeAmt = isDestroy ? 
I->getOperand(1).getImm() : 0; -  I = MBB.erase(I); - -  if (!reserveCallFrame) { -    // If the stack pointer can be changed after prologue, turn the -    // adjcallstackup instruction into a 'sub ESP, <amt>' and the -    // adjcallstackdown instruction into 'add ESP, <amt>' -    if (Amount == 0) -      return; - -    // We need to keep the stack aligned properly.  To do this, we round the -    // amount of space needed for the outgoing arguments up to the next -    // alignment boundary. -    unsigned StackAlign = MF.getTarget() -                              .getSubtargetImpl() -                              ->getFrameLowering() -                              ->getStackAlignment(); -    Amount = (Amount + StackAlign - 1) / StackAlign * StackAlign; - -    MachineInstr *New = nullptr; -    if (Opcode == TII.getCallFrameSetupOpcode()) { -      // Try to convert movs to the stack into pushes. -      // We currently only look for a pattern that appears in 32-bit -      // calling conventions. -      if (!IsLP64 && convertArgMovsToPushes(MF, MBB, I, Amount)) -        return; - -      New = BuildMI(MF, DL, TII.get(getSUBriOpcode(IsLP64, Amount)), -                    StackPtr) -        .addReg(StackPtr) -        .addImm(Amount); -    } else { -      assert(Opcode == TII.getCallFrameDestroyOpcode()); - -      // Factor out the amount the callee already popped. -      Amount -= CalleeAmt; - -      if (Amount) { -        unsigned Opc = getADDriOpcode(IsLP64, Amount); -        New = BuildMI(MF, DL, TII.get(Opc), StackPtr) -          .addReg(StackPtr).addImm(Amount); -      } -    } - -    if (New) { -      // The EFLAGS implicit def is dead. -      New->getOperand(3).setIsDead(); - -      // Replace the pseudo instruction with a new instruction. -      MBB.insert(I, New); -    } - -    return; -  } - -  if (Opcode == TII.getCallFrameDestroyOpcode() && CalleeAmt) { -    // If we are performing frame pointer elimination and if the callee pops -    // something off the stack pointer, add it back.  We do this until we have -    // more advanced stack pointer tracking ability. -    unsigned Opc = getSUBriOpcode(IsLP64, CalleeAmt); -    MachineInstr *New = BuildMI(MF, DL, TII.get(Opc), StackPtr) -      .addReg(StackPtr).addImm(CalleeAmt); - -    // The EFLAGS implicit def is dead. -    New->getOperand(3).setIsDead(); - -    // We are not tracking the stack pointer adjustment by the callee, so make -    // sure we restore the stack pointer immediately after the call, there may -    // be spill code inserted between the CALL and ADJCALLSTACKUP instructions. -    MachineBasicBlock::iterator B = MBB.begin(); -    while (I != B && !std::prev(I)->isCall()) -      --I; -    MBB.insert(I, New); -  } -} - +//===-- X86FrameLowering.cpp - X86 Frame Information ----------------------===//
 +//
 +//                     The LLVM Compiler Infrastructure
 +//
 +// This file is distributed under the University of Illinois Open Source
 +// License. See LICENSE.TXT for details.
 +//
 +//===----------------------------------------------------------------------===//
 +//
 +// This file contains the X86 implementation of TargetFrameLowering class.
 +//
 +//===----------------------------------------------------------------------===//
 +
 +#include "X86FrameLowering.h"
 +#include "X86InstrBuilder.h"
 +#include "X86InstrInfo.h"
 +#include "X86MachineFunctionInfo.h"
 +#include "X86Subtarget.h"
 +#include "X86TargetMachine.h"
 +#include "llvm/ADT/SmallSet.h"
 +#include "llvm/CodeGen/MachineFrameInfo.h"
 +#include "llvm/CodeGen/MachineFunction.h"
 +#include "llvm/CodeGen/MachineInstrBuilder.h"
 +#include "llvm/CodeGen/MachineModuleInfo.h"
 +#include "llvm/CodeGen/MachineRegisterInfo.h"
 +#include "llvm/IR/DataLayout.h"
 +#include "llvm/IR/Function.h"
 +#include "llvm/MC/MCAsmInfo.h"
 +#include "llvm/MC/MCSymbol.h"
 +#include "llvm/Support/CommandLine.h"
 +#include "llvm/Target/TargetOptions.h"
 +#include "llvm/Support/Debug.h"
 +#include <cstdlib>
 +
 +using namespace llvm;
 +
 +// FIXME: completely move here.
 +extern cl::opt<bool> ForceStackAlign;
 +
 +bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
 +  return !MF.getFrameInfo()->hasVarSizedObjects() &&
 +         !MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences();
 +}
 +
 +/// canSimplifyCallFramePseudos - If there is a reserved call frame, the
 +/// call frame pseudos can be simplified.  Having a FP, as in the default
 +/// implementation, is not sufficient here since we can't always use it.
 +/// Use a more nuanced condition.
 +bool
 +X86FrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const {
 +  const X86RegisterInfo *TRI = static_cast<const X86RegisterInfo *>
 +                               (MF.getSubtarget().getRegisterInfo());
 +  return hasReservedCallFrame(MF) ||
 +         (hasFP(MF) && !TRI->needsStackRealignment(MF))
 +         || TRI->hasBasePointer(MF);
 +}
 +
 +// needsFrameIndexResolution - Do we need to perform FI resolution for
 +// this function. Normally, this is required only when the function
 +// has any stack objects. However, FI resolution actually has another job,
+// not apparent from its name - it resolves call frame setup/destroy pseudos
 +// that were not simplified earlier.
 +// So, this is required for x86 functions that have push sequences even
 +// when there are no stack objects.
 +bool
 +X86FrameLowering::needsFrameIndexResolution(const MachineFunction &MF) const {
 +  return MF.getFrameInfo()->hasStackObjects() ||
 +         MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences();
 +}
 +
 +/// hasFP - Return true if the specified function should have a dedicated frame
 +/// pointer register.  This is true if the function has variable sized allocas
 +/// or if frame pointer elimination is disabled.
 +bool X86FrameLowering::hasFP(const MachineFunction &MF) const {
 +  const MachineFrameInfo *MFI = MF.getFrameInfo();
 +  const MachineModuleInfo &MMI = MF.getMMI();
 +  const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
 +
 +  return (MF.getTarget().Options.DisableFramePointerElim(MF) ||
 +          RegInfo->needsStackRealignment(MF) ||
 +          MFI->hasVarSizedObjects() ||
 +          MFI->isFrameAddressTaken() || MFI->hasInlineAsmWithSPAdjust() ||
 +          MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() ||
 +          MMI.callsUnwindInit() || MMI.callsEHReturn() ||
 +          MFI->hasStackMap() || MFI->hasPatchPoint());
 +}
 +
 +static unsigned getSUBriOpcode(unsigned IsLP64, int64_t Imm) {
 +  if (IsLP64) {
 +    if (isInt<8>(Imm))
 +      return X86::SUB64ri8;
 +    return X86::SUB64ri32;
 +  } else {
 +    if (isInt<8>(Imm))
 +      return X86::SUB32ri8;
 +    return X86::SUB32ri;
 +  }
 +}
 +
 +static unsigned getADDriOpcode(unsigned IsLP64, int64_t Imm) {
 +  if (IsLP64) {
 +    if (isInt<8>(Imm))
 +      return X86::ADD64ri8;
 +    return X86::ADD64ri32;
 +  } else {
 +    if (isInt<8>(Imm))
 +      return X86::ADD32ri8;
 +    return X86::ADD32ri;
 +  }
 +}
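
As an aside (not part of the patch): the ri8 vs. ri32 selection in the two helpers above hinges only on whether the immediate fits in a signed 8-bit field. A minimal standalone C++ sketch of that predicate, with a hypothetical fitsInSigned8 standing in for llvm::isInt<8>:

  #include <cassert>
  #include <cstdint>

  // Hypothetical stand-in for llvm::isInt<8>(Imm).
  static bool fitsInSigned8(int64_t Imm) { return Imm >= -128 && Imm <= 127; }

  int main() {
    assert(fitsInSigned8(8));     // would select SUB64ri8 / ADD64ri8
    assert(!fitsInSigned8(128));  // falls back to SUB64ri32 / ADD64ri32
    assert(fitsInSigned8(-128));  // negative immediates still fit the 8-bit form
    return 0;
  }
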
 +
 +static unsigned getSUBrrOpcode(unsigned isLP64) {
 +  return isLP64 ? X86::SUB64rr : X86::SUB32rr;
 +}
 +
 +static unsigned getADDrrOpcode(unsigned isLP64) {
 +  return isLP64 ? X86::ADD64rr : X86::ADD32rr;
 +}
 +
 +static unsigned getANDriOpcode(bool IsLP64, int64_t Imm) {
 +  if (IsLP64) {
 +    if (isInt<8>(Imm))
 +      return X86::AND64ri8;
 +    return X86::AND64ri32;
 +  }
 +  if (isInt<8>(Imm))
 +    return X86::AND32ri8;
 +  return X86::AND32ri;
 +}
 +
 +static unsigned getLEArOpcode(unsigned IsLP64) {
 +  return IsLP64 ? X86::LEA64r : X86::LEA32r;
 +}
 +
 +/// findDeadCallerSavedReg - Return a caller-saved register that isn't live
 +/// when it reaches the "return" instruction. We can then pop a stack object
+/// to this register without worrying about clobbering it.
 +static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB,
 +                                       MachineBasicBlock::iterator &MBBI,
 +                                       const TargetRegisterInfo &TRI,
 +                                       bool Is64Bit) {
 +  const MachineFunction *MF = MBB.getParent();
 +  const Function *F = MF->getFunction();
 +  if (!F || MF->getMMI().callsEHReturn())
 +    return 0;
 +
 +  static const uint16_t CallerSavedRegs32Bit[] = {
 +    X86::EAX, X86::EDX, X86::ECX, 0
 +  };
 +
 +  static const uint16_t CallerSavedRegs64Bit[] = {
 +    X86::RAX, X86::RDX, X86::RCX, X86::RSI, X86::RDI,
 +    X86::R8,  X86::R9,  X86::R10, X86::R11, 0
 +  };
 +
 +  unsigned Opc = MBBI->getOpcode();
 +  switch (Opc) {
 +  default: return 0;
 +  case X86::RETL:
 +  case X86::RETQ:
 +  case X86::RETIL:
 +  case X86::RETIQ:
 +  case X86::TCRETURNdi:
 +  case X86::TCRETURNri:
 +  case X86::TCRETURNmi:
 +  case X86::TCRETURNdi64:
 +  case X86::TCRETURNri64:
 +  case X86::TCRETURNmi64:
 +  case X86::EH_RETURN:
 +  case X86::EH_RETURN64: {
 +    SmallSet<uint16_t, 8> Uses;
 +    for (unsigned i = 0, e = MBBI->getNumOperands(); i != e; ++i) {
 +      MachineOperand &MO = MBBI->getOperand(i);
 +      if (!MO.isReg() || MO.isDef())
 +        continue;
 +      unsigned Reg = MO.getReg();
 +      if (!Reg)
 +        continue;
 +      for (MCRegAliasIterator AI(Reg, &TRI, true); AI.isValid(); ++AI)
 +        Uses.insert(*AI);
 +    }
 +
 +    const uint16_t *CS = Is64Bit ? CallerSavedRegs64Bit : CallerSavedRegs32Bit;
 +    for (; *CS; ++CS)
 +      if (!Uses.count(*CS))
 +        return *CS;
 +  }
 +  }
 +
 +  return 0;
 +}
 +
 +static bool isEAXLiveIn(MachineFunction &MF) {
 +  for (MachineRegisterInfo::livein_iterator II = MF.getRegInfo().livein_begin(),
 +       EE = MF.getRegInfo().livein_end(); II != EE; ++II) {
 +    unsigned Reg = II->first;
 +
 +    if (Reg == X86::RAX || Reg == X86::EAX || Reg == X86::AX ||
 +        Reg == X86::AH || Reg == X86::AL)
 +      return true;
 +  }
 +
 +  return false;
 +}
 +
 +/// emitSPUpdate - Emit a series of instructions to increment / decrement the
 +/// stack pointer by a constant value.
 +static
 +void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
 +                  unsigned StackPtr, int64_t NumBytes,
 +                  bool Is64BitTarget, bool Is64BitStackPtr, bool UseLEA,
 +                  const TargetInstrInfo &TII, const TargetRegisterInfo &TRI) {
 +  bool isSub = NumBytes < 0;
 +  uint64_t Offset = isSub ? -NumBytes : NumBytes;
 +  unsigned Opc;
 +  if (UseLEA)
 +    Opc = getLEArOpcode(Is64BitStackPtr);
 +  else
 +    Opc = isSub
 +      ? getSUBriOpcode(Is64BitStackPtr, Offset)
 +      : getADDriOpcode(Is64BitStackPtr, Offset);
 +
 +  uint64_t Chunk = (1LL << 31) - 1;
 +  DebugLoc DL = MBB.findDebugLoc(MBBI);
 +
 +  while (Offset) {
 +    if (Offset > Chunk) {
 +      // Rather than emit a long series of instructions for large offsets,
 +      // load the offset into a register and do one sub/add
 +      unsigned Reg = 0;
 +
 +      if (isSub && !isEAXLiveIn(*MBB.getParent()))
 +        Reg = (unsigned)(Is64BitTarget ? X86::RAX : X86::EAX);
 +      else
 +        Reg = findDeadCallerSavedReg(MBB, MBBI, TRI, Is64BitTarget);
 +
 +      if (Reg) {
 +        Opc = Is64BitTarget ? X86::MOV64ri : X86::MOV32ri;
 +        BuildMI(MBB, MBBI, DL, TII.get(Opc), Reg)
 +          .addImm(Offset);
 +        Opc = isSub
 +          ? getSUBrrOpcode(Is64BitTarget)
 +          : getADDrrOpcode(Is64BitTarget);
 +        MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
 +          .addReg(StackPtr)
 +          .addReg(Reg);
 +        MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
 +        Offset = 0;
 +        continue;
 +      }
 +    }
 +
 +    uint64_t ThisVal = (Offset > Chunk) ? Chunk : Offset;
 +    if (ThisVal == (Is64BitTarget ? 8 : 4)) {
 +      // Use push / pop instead.
 +      unsigned Reg = isSub
 +        ? (unsigned)(Is64BitTarget ? X86::RAX : X86::EAX)
 +        : findDeadCallerSavedReg(MBB, MBBI, TRI, Is64BitTarget);
 +      if (Reg) {
 +        Opc = isSub
 +          ? (Is64BitTarget ? X86::PUSH64r : X86::PUSH32r)
 +          : (Is64BitTarget ? X86::POP64r  : X86::POP32r);
 +        MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc))
 +          .addReg(Reg, getDefRegState(!isSub) | getUndefRegState(isSub));
 +        if (isSub)
 +          MI->setFlag(MachineInstr::FrameSetup);
 +        Offset -= ThisVal;
 +        continue;
 +      }
 +    }
 +
 +    MachineInstr *MI = nullptr;
 +
 +    if (UseLEA) {
 +      MI =  addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr),
 +                          StackPtr, false, isSub ? -ThisVal : ThisVal);
 +    } else {
 +      MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
 +            .addReg(StackPtr)
 +            .addImm(ThisVal);
 +      MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
 +    }
 +
 +    if (isSub)
 +      MI->setFlag(MachineInstr::FrameSetup);
 +
 +    Offset -= ThisVal;
 +  }
 +}
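
A hedged sketch of the fallback path in emitSPUpdate above: when no dead scratch register is available, a very large adjustment is emitted as a series of immediate sub/add instructions, each no larger than 2^31 - 1 bytes. Plain C++, values are hypothetical:

  #include <cstdint>
  #include <cstdio>

  int main() {
    const uint64_t Chunk = (1LL << 31) - 1;        // same limit as in emitSPUpdate
    uint64_t Offset = 5ULL * 1024 * 1024 * 1024;   // hypothetical 5 GiB adjustment
    while (Offset) {
      uint64_t ThisVal = (Offset > Chunk) ? Chunk : Offset;
      std::printf("emit one sub/add of %llu bytes\n", (unsigned long long)ThisVal);
      Offset -= ThisVal;                           // three chunks for this example
    }
    return 0;
  }
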
 +
+/// mergeSPUpdatesUp - Fold a stack-pointer adjustment made by the instruction
+/// immediately above the given iterator into *NumBytes (if provided) and
+/// erase that instruction.
 +static
 +void mergeSPUpdatesUp(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
 +                      unsigned StackPtr, uint64_t *NumBytes = nullptr) {
 +  if (MBBI == MBB.begin()) return;
 +
 +  MachineBasicBlock::iterator PI = std::prev(MBBI);
 +  unsigned Opc = PI->getOpcode();
 +  if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 ||
 +       Opc == X86::ADD32ri || Opc == X86::ADD32ri8 ||
 +       Opc == X86::LEA32r || Opc == X86::LEA64_32r) &&
 +      PI->getOperand(0).getReg() == StackPtr) {
 +    if (NumBytes)
 +      *NumBytes += PI->getOperand(2).getImm();
 +    MBB.erase(PI);
 +  } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 ||
 +              Opc == X86::SUB32ri || Opc == X86::SUB32ri8) &&
 +             PI->getOperand(0).getReg() == StackPtr) {
 +    if (NumBytes)
 +      *NumBytes -= PI->getOperand(2).getImm();
 +    MBB.erase(PI);
 +  }
 +}
 +
+/// mergeSPUpdatesDown - Fold a stack-pointer adjustment made by the
+/// instruction immediately below the given iterator into *NumBytes (if
+/// provided) and erase that instruction.
 +static
 +void mergeSPUpdatesDown(MachineBasicBlock &MBB,
 +                        MachineBasicBlock::iterator &MBBI,
 +                        unsigned StackPtr, uint64_t *NumBytes = nullptr) {
 +  // FIXME:  THIS ISN'T RUN!!!
 +  return;
 +
 +  if (MBBI == MBB.end()) return;
 +
 +  MachineBasicBlock::iterator NI = std::next(MBBI);
 +  if (NI == MBB.end()) return;
 +
 +  unsigned Opc = NI->getOpcode();
 +  if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 ||
 +       Opc == X86::ADD32ri || Opc == X86::ADD32ri8) &&
 +      NI->getOperand(0).getReg() == StackPtr) {
 +    if (NumBytes)
 +      *NumBytes -= NI->getOperand(2).getImm();
 +    MBB.erase(NI);
 +    MBBI = NI;
 +  } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 ||
 +              Opc == X86::SUB32ri || Opc == X86::SUB32ri8) &&
 +             NI->getOperand(0).getReg() == StackPtr) {
 +    if (NumBytes)
 +      *NumBytes += NI->getOperand(2).getImm();
 +    MBB.erase(NI);
 +    MBBI = NI;
 +  }
 +}
 +
+/// mergeSPUpdates - Checks the instruction before/after the passed
+/// instruction. If it is an ADD/SUB/LEA instruction on the stack pointer, it
+/// is deleted and the stack adjustment is returned as a positive value for
+/// ADD/LEA and a negative one for SUB.
 +static int mergeSPUpdates(MachineBasicBlock &MBB,
 +                          MachineBasicBlock::iterator &MBBI, unsigned StackPtr,
 +                          bool doMergeWithPrevious) {
 +  if ((doMergeWithPrevious && MBBI == MBB.begin()) ||
 +      (!doMergeWithPrevious && MBBI == MBB.end()))
 +    return 0;
 +
 +  MachineBasicBlock::iterator PI = doMergeWithPrevious ? std::prev(MBBI) : MBBI;
 +  MachineBasicBlock::iterator NI = doMergeWithPrevious ? nullptr
 +                                                       : std::next(MBBI);
 +  unsigned Opc = PI->getOpcode();
 +  int Offset = 0;
 +
 +  if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 ||
 +       Opc == X86::ADD32ri || Opc == X86::ADD32ri8 ||
 +       Opc == X86::LEA32r || Opc == X86::LEA64_32r) &&
 +      PI->getOperand(0).getReg() == StackPtr){
 +    Offset += PI->getOperand(2).getImm();
 +    MBB.erase(PI);
 +    if (!doMergeWithPrevious) MBBI = NI;
 +  } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 ||
 +              Opc == X86::SUB32ri || Opc == X86::SUB32ri8) &&
 +             PI->getOperand(0).getReg() == StackPtr) {
 +    Offset -= PI->getOperand(2).getImm();
 +    MBB.erase(PI);
 +    if (!doMergeWithPrevious) MBBI = NI;
 +  }
 +
 +  return Offset;
 +}
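
The sign convention of mergeSPUpdates (positive for ADD/LEA, negative for SUB) matters for how callers fold the result; emitPrologue below does `NumBytes -= mergeSPUpdates(...)`. A hedged arithmetic sketch with hypothetical values:

  #include <cassert>
  #include <cstdint>

  int main() {
    // Suppose the prologue wants to allocate 64 bytes and the instruction just
    // above MBBI is "sub esp, 16"; mergeSPUpdates erases it and returns -16.
    int64_t NumBytes = 64;
    int MergedAdjustment = -16;
    NumBytes -= MergedAdjustment;   // the erased SUB's 16 bytes are folded in
    assert(NumBytes == 80);         // a single "sub esp, 80" now covers both
    return 0;
  }
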
 +
 +void
 +X86FrameLowering::emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
 +                                            MachineBasicBlock::iterator MBBI,
 +                                            DebugLoc DL) const {
 +  MachineFunction &MF = *MBB.getParent();
 +  MachineFrameInfo *MFI = MF.getFrameInfo();
 +  MachineModuleInfo &MMI = MF.getMMI();
 +  const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
 +  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
 +
 +  // Add callee saved registers to move list.
 +  const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
 +  if (CSI.empty()) return;
 +
 +  // Calculate offsets.
 +  for (std::vector<CalleeSavedInfo>::const_iterator
 +         I = CSI.begin(), E = CSI.end(); I != E; ++I) {
 +    int64_t Offset = MFI->getObjectOffset(I->getFrameIdx());
 +    unsigned Reg = I->getReg();
 +
 +    unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
 +    unsigned CFIIndex =
 +        MMI.addFrameInst(MCCFIInstruction::createOffset(nullptr, DwarfReg,
 +                                                        Offset));
 +    BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
 +        .addCFIIndex(CFIIndex);
 +  }
 +}
 +
+/// usesTheStack - This function checks whether any user of EFLAGS
+/// copies it. We know that the code that lowers a COPY of EFLAGS has
 +/// to use the stack, and if we don't adjust the stack we clobber the first
 +/// frame index.
 +/// See X86InstrInfo::copyPhysReg.
 +static bool usesTheStack(const MachineFunction &MF) {
 +  const MachineRegisterInfo &MRI = MF.getRegInfo();
 +
 +  for (MachineRegisterInfo::reg_instr_iterator
 +       ri = MRI.reg_instr_begin(X86::EFLAGS), re = MRI.reg_instr_end();
 +       ri != re; ++ri)
 +    if (ri->isCopy())
 +      return true;
 +
 +  return false;
 +}
 +
 +void X86FrameLowering::emitStackProbeCall(MachineFunction &MF,
 +                                          MachineBasicBlock &MBB,
 +                                          MachineBasicBlock::iterator MBBI,
 +                                          DebugLoc DL) {
 +  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
 +  const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
 +  bool Is64Bit = STI.is64Bit();
 +  bool IsLargeCodeModel = MF.getTarget().getCodeModel() == CodeModel::Large;
 +
 +  unsigned CallOp;
 +  if (Is64Bit)
 +    CallOp = IsLargeCodeModel ? X86::CALL64r : X86::CALL64pcrel32;
 +  else
 +    CallOp = X86::CALLpcrel32;
 +
 +  const char *Symbol;
 +  if (Is64Bit) {
 +    if (STI.isTargetCygMing()) {
 +      Symbol = "___chkstk_ms";
 +    } else {
 +      Symbol = "__chkstk";
 +    }
 +  } else if (STI.isTargetCygMing())
 +    Symbol = "_alloca";
 +  else
 +    Symbol = "_chkstk";
 +
 +  MachineInstrBuilder CI;
 +
 +  // All current stack probes take AX and SP as input, clobber flags, and
 +  // preserve all registers. x86_64 probes leave RSP unmodified.
 +  if (Is64Bit && MF.getTarget().getCodeModel() == CodeModel::Large) {
 +    // For the large code model, we have to call through a register. Use R11,
 +    // as it is scratch in all supported calling conventions.
 +    BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::R11)
 +        .addExternalSymbol(Symbol);
 +    CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp)).addReg(X86::R11);
 +  } else {
 +    CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp)).addExternalSymbol(Symbol);
 +  }
 +
 +  unsigned AX = Is64Bit ? X86::RAX : X86::EAX;
 +  unsigned SP = Is64Bit ? X86::RSP : X86::ESP;
 +  CI.addReg(AX, RegState::Implicit)
 +      .addReg(SP, RegState::Implicit)
 +      .addReg(AX, RegState::Define | RegState::Implicit)
 +      .addReg(SP, RegState::Define | RegState::Implicit)
 +      .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
 +
 +  if (Is64Bit) {
 +    // MSVC x64's __chkstk and cygwin/mingw's ___chkstk_ms do not adjust %rsp
+    // themselves. They also do not clobber %rax, so we can reuse it when
 +    // adjusting %rsp.
 +    BuildMI(MBB, MBBI, DL, TII.get(X86::SUB64rr), X86::RSP)
 +        .addReg(X86::RSP)
 +        .addReg(X86::RAX);
 +  }
 +}
 +
 +/// emitPrologue - Push callee-saved registers onto the stack, which
+/// automatically adjusts the stack pointer. Adjust the stack pointer to allocate
 +/// space for local variables. Also emit labels used by the exception handler to
 +/// generate the exception handling frames.
 +
 +/*
 +  Here's a gist of what gets emitted:
 +
 +  ; Establish frame pointer, if needed
 +  [if needs FP]
 +      push  %rbp
 +      .cfi_def_cfa_offset 16
 +      .cfi_offset %rbp, -16
+      .seh_pushreg %rbp
 +      mov  %rsp, %rbp
 +      .cfi_def_cfa_register %rbp
 +
 +  ; Spill general-purpose registers
 +  [for all callee-saved GPRs]
 +      pushq %<reg>
 +      [if not needs FP]
 +         .cfi_def_cfa_offset (offset from RETADDR)
 +      .seh_pushreg %<reg>
 +
 +  ; If the required stack alignment > default stack alignment
 +  ; rsp needs to be re-aligned.  This creates a "re-alignment gap"
 +  ; of unknown size in the stack frame.
 +  [if stack needs re-alignment]
 +      and  $MASK, %rsp
 +
 +  ; Allocate space for locals
 +  [if target is Windows and allocated space > 4096 bytes]
 +      ; Windows needs special care for allocations larger
 +      ; than one page.
 +      mov $NNN, %rax
 +      call ___chkstk_ms/___chkstk
 +      sub  %rax, %rsp
 +  [else]
 +      sub  $NNN, %rsp
 +
 +  [if needs FP]
 +      .seh_stackalloc (size of XMM spill slots)
 +      .seh_setframe %rbp, SEHFrameOffset ; = size of all spill slots
 +  [else]
 +      .seh_stackalloc NNN
 +
 +  ; Spill XMMs
 +  ; Note, that while only Windows 64 ABI specifies XMMs as callee-preserved,
 +  ; they may get spilled on any platform, if the current function
 +  ; calls @llvm.eh.unwind.init
 +  [if needs FP]
 +      [for all callee-saved XMM registers]
 +          movaps  %<xmm reg>, -MMM(%rbp)
 +      [for all callee-saved XMM registers]
 +          .seh_savexmm %<xmm reg>, (-MMM + SEHFrameOffset)
 +              ; i.e. the offset relative to (%rbp - SEHFrameOffset)
 +  [else]
 +      [for all callee-saved XMM registers]
 +          movaps  %<xmm reg>, KKK(%rsp)
 +      [for all callee-saved XMM registers]
 +          .seh_savexmm %<xmm reg>, KKK
 +
 +  .seh_endprologue
 +
 +  [if needs base pointer]
 +      mov  %rsp, %rbx
 +      [if needs to restore base pointer]
 +          mov %rsp, -MMM(%rbp)
 +
 +  ; Emit CFI info
 +  [if needs FP]
 +      [for all callee-saved registers]
 +          .cfi_offset %<reg>, (offset from %rbp)
 +  [else]
 +       .cfi_def_cfa_offset (offset from RETADDR)
 +      [for all callee-saved registers]
 +          .cfi_offset %<reg>, (offset from %rsp)
 +
 +  Notes:
 +  - .seh directives are emitted only for Windows 64 ABI
 +  - .cfi directives are emitted for all other ABIs
 +  - for 32-bit code, substitute %e?? registers for %r??
 +*/
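
The "re-alignment gap" mentioned in the gist comes from rounding the frame size up to the maximum alignment; emitPrologue below uses the classic `(Size + Align - 1) / Align * Align` round-up. A minimal sketch of that arithmetic (plain C++, hypothetical values):

  #include <cassert>
  #include <cstdint>

  // Round Size up to the next multiple of Align (a power of two here, matching
  // MFI->getMaxAlignment()).
  static uint64_t roundUpToAlignment(uint64_t Size, uint64_t Align) {
    return (Size + Align - 1) / Align * Align;
  }

  int main() {
    assert(roundUpToAlignment(40, 16) == 48);  // 40-byte frame, 16-byte alignment
    assert(roundUpToAlignment(48, 16) == 48);  // already aligned, unchanged
    return 0;
  }
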
 +
 +void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
 +  MachineBasicBlock &MBB = MF.front(); // Prologue goes in entry BB.
 +  MachineBasicBlock::iterator MBBI = MBB.begin();
 +  MachineFrameInfo *MFI = MF.getFrameInfo();
 +  const Function *Fn = MF.getFunction();
 +  const X86RegisterInfo *RegInfo =
 +      static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo());
 +  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
 +  MachineModuleInfo &MMI = MF.getMMI();
 +  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
 +  uint64_t MaxAlign  = MFI->getMaxAlignment(); // Desired stack alignment.
 +  uint64_t StackSize = MFI->getStackSize();    // Number of bytes to allocate.
 +  bool HasFP = hasFP(MF);
 +  const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
 +  bool Is64Bit = STI.is64Bit();
 +  // standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit.
 +  const bool Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64();
 +  bool IsWin64 = STI.isTargetWin64();
 +  // Not necessarily synonymous with IsWin64.
 +  bool IsWinEH = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
 +  bool NeedsWinEH = IsWinEH && Fn->needsUnwindTableEntry();
 +  bool NeedsDwarfCFI =
 +      !IsWinEH && (MMI.hasDebugInfo() || Fn->needsUnwindTableEntry());
 +  bool UseLEA = STI.useLeaForSP();
 +  unsigned StackAlign = getStackAlignment();
 +  unsigned SlotSize = RegInfo->getSlotSize();
 +  unsigned FramePtr = RegInfo->getFrameRegister(MF);
 +  const unsigned MachineFramePtr = STI.isTarget64BitILP32() ?
 +                 getX86SubSuperRegister(FramePtr, MVT::i64, false) : FramePtr;
 +  unsigned StackPtr = RegInfo->getStackRegister();
 +  unsigned BasePtr = RegInfo->getBaseRegister();
 +  DebugLoc DL;
 +
 +  // If we're forcing a stack realignment we can't rely on just the frame
 +  // info, we need to know the ABI stack alignment as well in case we
 +  // have a call out.  Otherwise just make sure we have some alignment - we'll
 +  // go with the minimum SlotSize.
 +  if (ForceStackAlign) {
 +    if (MFI->hasCalls())
 +      MaxAlign = (StackAlign > MaxAlign) ? StackAlign : MaxAlign;
 +    else if (MaxAlign < SlotSize)
 +      MaxAlign = SlotSize;
 +  }
 +
 +  // Add RETADDR move area to callee saved frame size.
 +  int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
 +  if (TailCallReturnAddrDelta < 0)
 +    X86FI->setCalleeSavedFrameSize(
 +      X86FI->getCalleeSavedFrameSize() - TailCallReturnAddrDelta);
 +
 +  bool UseStackProbe = (STI.isOSWindows() && !STI.isTargetMachO());
 +
 +  // The default stack probe size is 4096 if the function has no stackprobesize
 +  // attribute.
 +  unsigned StackProbeSize = 4096;
 +  if (Fn->hasFnAttribute("stack-probe-size"))
 +    Fn->getFnAttribute("stack-probe-size")
 +        .getValueAsString()
 +        .getAsInteger(0, StackProbeSize);
 +
+  // If this is x86-64, the Red Zone is not disabled, we are a leaf function,
+  // we use at most 128 bytes of stack space, and we have no frame pointer,
+  // calls, or dynamic allocas, then we do not need to adjust the stack
+  // pointer (we fit in the Red Zone). We also check that we don't push and
+  // pop from the stack.
 +  if (Is64Bit && !Fn->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
 +                                                   Attribute::NoRedZone) &&
 +      !RegInfo->needsStackRealignment(MF) &&
 +      !MFI->hasVarSizedObjects() &&                     // No dynamic alloca.
 +      !MFI->adjustsStack() &&                           // No calls.
 +      !IsWin64 &&                                       // Win64 has no Red Zone
 +      !usesTheStack(MF) &&                              // Don't push and pop.
 +      !MF.shouldSplitStack()) {                         // Regular stack
 +    uint64_t MinSize = X86FI->getCalleeSavedFrameSize();
 +    if (HasFP) MinSize += SlotSize;
 +    StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0);
 +    MFI->setStackSize(StackSize);
 +  }
 +
 +  // Insert stack pointer adjustment for later moving of return addr.  Only
 +  // applies to tail call optimized functions where the callee argument stack
+  // size is bigger than the caller's.
 +  if (TailCallReturnAddrDelta < 0) {
 +    MachineInstr *MI =
 +      BuildMI(MBB, MBBI, DL,
 +              TII.get(getSUBriOpcode(Uses64BitFramePtr, -TailCallReturnAddrDelta)),
 +              StackPtr)
 +        .addReg(StackPtr)
 +        .addImm(-TailCallReturnAddrDelta)
 +        .setMIFlag(MachineInstr::FrameSetup);
 +    MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
 +  }
 +
 +  // Mapping for machine moves:
 +  //
 +  //   DST: VirtualFP AND
 +  //        SRC: VirtualFP              => DW_CFA_def_cfa_offset
 +  //        ELSE                        => DW_CFA_def_cfa
 +  //
 +  //   SRC: VirtualFP AND
 +  //        DST: Register               => DW_CFA_def_cfa_register
 +  //
 +  //   ELSE
 +  //        OFFSET < 0                  => DW_CFA_offset_extended_sf
 +  //        REG < 64                    => DW_CFA_offset + Reg
 +  //        ELSE                        => DW_CFA_offset_extended
 +
 +  uint64_t NumBytes = 0;
 +  int stackGrowth = -SlotSize;
 +
 +  if (HasFP) {
 +    // Calculate required stack adjustment.
 +    uint64_t FrameSize = StackSize - SlotSize;
 +    // If required, include space for extra hidden slot for stashing base pointer.
 +    if (X86FI->getRestoreBasePointer())
 +      FrameSize += SlotSize;
 +    if (RegInfo->needsStackRealignment(MF)) {
 +      // Callee-saved registers are pushed on stack before the stack
 +      // is realigned.
 +      FrameSize -= X86FI->getCalleeSavedFrameSize();
 +      NumBytes = (FrameSize + MaxAlign - 1) / MaxAlign * MaxAlign;
 +    } else {
 +      NumBytes = FrameSize - X86FI->getCalleeSavedFrameSize();
 +    }
 +
 +    // Get the offset of the stack slot for the EBP register, which is
 +    // guaranteed to be the last slot by processFunctionBeforeFrameFinalized.
 +    // Update the frame offset adjustment.
 +    MFI->setOffsetAdjustment(-NumBytes);
 +
 +    // Save EBP/RBP into the appropriate stack slot.
 +    BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r))
 +      .addReg(MachineFramePtr, RegState::Kill)
 +      .setMIFlag(MachineInstr::FrameSetup);
 +
 +    if (NeedsDwarfCFI) {
 +      // Mark the place where EBP/RBP was saved.
 +      // Define the current CFA rule to use the provided offset.
 +      assert(StackSize);
 +      unsigned CFIIndex = MMI.addFrameInst(
 +          MCCFIInstruction::createDefCfaOffset(nullptr, 2 * stackGrowth));
 +      BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
 +          .addCFIIndex(CFIIndex);
 +
 +      // Change the rule for the FramePtr to be an "offset" rule.
 +      unsigned DwarfFramePtr = RegInfo->getDwarfRegNum(MachineFramePtr, true);
 +      CFIIndex = MMI.addFrameInst(
 +          MCCFIInstruction::createOffset(nullptr,
 +                                         DwarfFramePtr, 2 * stackGrowth));
 +      BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
 +          .addCFIIndex(CFIIndex);
 +    }
 +
 +    if (NeedsWinEH) {
 +      BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg))
 +          .addImm(FramePtr)
 +          .setMIFlag(MachineInstr::FrameSetup);
 +    }
 +
 +    // Update EBP with the new base value.
 +    BuildMI(MBB, MBBI, DL,
 +            TII.get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr), FramePtr)
 +        .addReg(StackPtr)
 +        .setMIFlag(MachineInstr::FrameSetup);
 +
 +    if (NeedsDwarfCFI) {
 +      // Mark effective beginning of when frame pointer becomes valid.
 +      // Define the current CFA to use the EBP/RBP register.
 +      unsigned DwarfFramePtr = RegInfo->getDwarfRegNum(MachineFramePtr, true);
 +      unsigned CFIIndex = MMI.addFrameInst(
 +          MCCFIInstruction::createDefCfaRegister(nullptr, DwarfFramePtr));
 +      BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
 +          .addCFIIndex(CFIIndex);
 +    }
 +
 +    // Mark the FramePtr as live-in in every block.
 +    for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I)
 +      I->addLiveIn(MachineFramePtr);
 +  } else {
 +    NumBytes = StackSize - X86FI->getCalleeSavedFrameSize();
 +  }
 +
 +  // Skip the callee-saved push instructions.
 +  bool PushedRegs = false;
 +  int StackOffset = 2 * stackGrowth;
 +
 +  while (MBBI != MBB.end() &&
 +         (MBBI->getOpcode() == X86::PUSH32r ||
 +          MBBI->getOpcode() == X86::PUSH64r)) {
 +    PushedRegs = true;
 +    unsigned Reg = MBBI->getOperand(0).getReg();
 +    ++MBBI;
 +
 +    if (!HasFP && NeedsDwarfCFI) {
 +      // Mark callee-saved push instruction.
 +      // Define the current CFA rule to use the provided offset.
 +      assert(StackSize);
 +      unsigned CFIIndex = MMI.addFrameInst(
 +          MCCFIInstruction::createDefCfaOffset(nullptr, StackOffset));
 +      BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
 +          .addCFIIndex(CFIIndex);
 +      StackOffset += stackGrowth;
 +    }
 +
 +    if (NeedsWinEH) {
 +      BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg)).addImm(Reg).setMIFlag(
 +          MachineInstr::FrameSetup);
 +    }
 +  }
 +
 +  // Realign stack after we pushed callee-saved registers (so that we'll be
 +  // able to calculate their offsets from the frame pointer).
 +  if (RegInfo->needsStackRealignment(MF)) {
 +    assert(HasFP && "There should be a frame pointer if stack is realigned.");
 +    uint64_t Val = -MaxAlign;
 +    MachineInstr *MI =
 +      BuildMI(MBB, MBBI, DL,
 +              TII.get(getANDriOpcode(Uses64BitFramePtr, Val)), StackPtr)
 +      .addReg(StackPtr)
 +      .addImm(Val)
 +      .setMIFlag(MachineInstr::FrameSetup);
 +
 +    // The EFLAGS implicit def is dead.
 +    MI->getOperand(3).setIsDead();
 +  }
 +
 +  // If there is an SUB32ri of ESP immediately before this instruction, merge
 +  // the two. This can be the case when tail call elimination is enabled and
+  // the callee has more arguments than the caller.
 +  NumBytes -= mergeSPUpdates(MBB, MBBI, StackPtr, true);
 +
 +  // If there is an ADD32ri or SUB32ri of ESP immediately after this
 +  // instruction, merge the two instructions.
 +  mergeSPUpdatesDown(MBB, MBBI, StackPtr, &NumBytes);
 +
 +  // Adjust stack pointer: ESP -= numbytes.
 +
 +  // Windows and cygwin/mingw require a prologue helper routine when allocating
 +  // more than 4K bytes on the stack.  Windows uses __chkstk and cygwin/mingw
 +  // uses __alloca.  __alloca and the 32-bit version of __chkstk will probe the
 +  // stack and adjust the stack pointer in one go.  The 64-bit version of
 +  // __chkstk is only responsible for probing the stack.  The 64-bit prologue is
 +  // responsible for adjusting the stack pointer.  Touching the stack at 4K
 +  // increments is necessary to ensure that the guard pages used by the OS
 +  // virtual memory manager are allocated in correct sequence.
 +  if (NumBytes >= StackProbeSize && UseStackProbe) {
 +    // Check whether EAX is livein for this function.
 +    bool isEAXAlive = isEAXLiveIn(MF);
 +
 +    if (isEAXAlive) {
+      // EAX should never be live-in here in the 64-bit case, since RAX is
+      // used to pass the allocation size to the stack probe; assert that we
+      // are compiling 32-bit code.
 +      assert(!Is64Bit && "EAX is livein in x64 case!");
 +
 +      // Save EAX
 +      BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH32r))
 +        .addReg(X86::EAX, RegState::Kill)
 +        .setMIFlag(MachineInstr::FrameSetup);
 +    }
 +
 +    if (Is64Bit) {
 +      // Handle the 64-bit Windows ABI case where we need to call __chkstk.
 +      // Function prologue is responsible for adjusting the stack pointer.
 +      BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::RAX)
 +        .addImm(NumBytes)
 +        .setMIFlag(MachineInstr::FrameSetup);
 +    } else {
 +      // Allocate NumBytes-4 bytes on stack in case of isEAXAlive.
 +      // We'll also use 4 already allocated bytes for EAX.
 +      BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX)
 +        .addImm(isEAXAlive ? NumBytes - 4 : NumBytes)
 +        .setMIFlag(MachineInstr::FrameSetup);
 +    }
 +
 +    // Save a pointer to the MI where we set AX.
 +    MachineBasicBlock::iterator SetRAX = MBBI;
 +    --SetRAX;
 +
 +    // Call __chkstk, __chkstk_ms, or __alloca.
 +    emitStackProbeCall(MF, MBB, MBBI, DL);
 +
 +    // Apply the frame setup flag to all inserted instrs.
 +    for (; SetRAX != MBBI; ++SetRAX)
 +      SetRAX->setFlag(MachineInstr::FrameSetup);
 +
 +    if (isEAXAlive) {
 +      // Restore EAX
 +      MachineInstr *MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm),
 +                                              X86::EAX),
 +                                      StackPtr, false, NumBytes - 4);
 +      MI->setFlag(MachineInstr::FrameSetup);
 +      MBB.insert(MBBI, MI);
 +    }
 +  } else if (NumBytes) {
 +    emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, Uses64BitFramePtr,
 +                 UseLEA, TII, *RegInfo);
 +  }
 +
 +  int SEHFrameOffset = 0;
 +  if (NeedsWinEH) {
 +    if (HasFP) {
 +      // We need to set frame base offset low enough such that all saved
 +      // register offsets would be positive relative to it, but we can't
 +      // just use NumBytes, because .seh_setframe offset must be <=240.
 +      // So we pretend to have only allocated enough space to spill the
 +      // non-volatile registers.
 +      // We don't care about the rest of stack allocation, because unwinder
 +      // will restore SP to (BP - SEHFrameOffset)
 +      for (const CalleeSavedInfo &Info : MFI->getCalleeSavedInfo()) {
 +        int offset = MFI->getObjectOffset(Info.getFrameIdx());
 +        SEHFrameOffset = std::max(SEHFrameOffset, std::abs(offset));
 +      }
+      SEHFrameOffset += SEHFrameOffset % 16; // ensure alignment
 +
 +      // This only needs to account for XMM spill slots, GPR slots
 +      // are covered by the .seh_pushreg's emitted above.
 +      unsigned Size = SEHFrameOffset - X86FI->getCalleeSavedFrameSize();
 +      if (Size) {
 +        BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_StackAlloc))
 +            .addImm(Size)
 +            .setMIFlag(MachineInstr::FrameSetup);
 +      }
 +
 +      BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SetFrame))
 +          .addImm(FramePtr)
 +          .addImm(SEHFrameOffset)
 +          .setMIFlag(MachineInstr::FrameSetup);
 +    } else {
 +      // SP will be the base register for restoring XMMs
 +      if (NumBytes) {
 +        BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_StackAlloc))
 +            .addImm(NumBytes)
 +            .setMIFlag(MachineInstr::FrameSetup);
 +      }
 +    }
 +  }
 +
 +  // Skip the rest of register spilling code
 +  while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup))
 +    ++MBBI;
 +
 +  // Emit SEH info for non-GPRs
 +  if (NeedsWinEH) {
 +    for (const CalleeSavedInfo &Info : MFI->getCalleeSavedInfo()) {
 +      unsigned Reg = Info.getReg();
 +      if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg))
 +        continue;
 +      assert(X86::FR64RegClass.contains(Reg) && "Unexpected register class");
 +
 +      int Offset = getFrameIndexOffset(MF, Info.getFrameIdx());
 +      Offset += SEHFrameOffset;
 +
 +      BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SaveXMM))
 +          .addImm(Reg)
 +          .addImm(Offset)
 +          .setMIFlag(MachineInstr::FrameSetup);
 +    }
 +
 +    BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_EndPrologue))
 +        .setMIFlag(MachineInstr::FrameSetup);
 +  }
 +
 +  // If we need a base pointer, set it up here. It's whatever the value
 +  // of the stack pointer is at this point. Any variable size objects
 +  // will be allocated after this, so we can still use the base pointer
 +  // to reference locals.
 +  if (RegInfo->hasBasePointer(MF)) {
 +    // Update the base pointer with the current stack pointer.
 +    unsigned Opc = Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr;
 +    BuildMI(MBB, MBBI, DL, TII.get(Opc), BasePtr)
 +      .addReg(StackPtr)
 +      .setMIFlag(MachineInstr::FrameSetup);
 +    if (X86FI->getRestoreBasePointer()) {
 +      // Stash value of base pointer.  Saving RSP instead of EBP shortens dependence chain.
 +      unsigned Opm = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr;
 +      addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)),
 +                   FramePtr, true, X86FI->getRestoreBasePointerOffset())
 +        .addReg(StackPtr)
 +        .setMIFlag(MachineInstr::FrameSetup);
 +    }
 +  }
 +
 +  if (((!HasFP && NumBytes) || PushedRegs) && NeedsDwarfCFI) {
 +    // Mark end of stack pointer adjustment.
 +    if (!HasFP && NumBytes) {
 +      // Define the current CFA rule to use the provided offset.
 +      assert(StackSize);
 +      unsigned CFIIndex = MMI.addFrameInst(
 +          MCCFIInstruction::createDefCfaOffset(nullptr,
 +                                               -StackSize + stackGrowth));
 +
 +      BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
 +          .addCFIIndex(CFIIndex);
 +    }
 +
 +    // Emit DWARF info specifying the offsets of the callee-saved registers.
 +    if (PushedRegs)
 +      emitCalleeSavedFrameMoves(MBB, MBBI, DL);
 +  }
 +}
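
A worked instance of the Red Zone shrink near the top of emitPrologue above (hedged, with hypothetical sizes): a 200-byte local area and a 16-byte callee-saved area leave an allocation reduced by the 128 bytes that live in the red zone.

  #include <algorithm>
  #include <cassert>
  #include <cstdint>

  int main() {
    uint64_t StackSize = 200;   // hypothetical MFI->getStackSize()
    uint64_t MinSize = 16;      // hypothetical callee-saved frame size
    StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0);
    assert(StackSize == 72);    // the first 128 bytes are addressed in the red zone
    return 0;
  }
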
 +
 +void X86FrameLowering::emitEpilogue(MachineFunction &MF,
 +                                    MachineBasicBlock &MBB) const {
 +  const MachineFrameInfo *MFI = MF.getFrameInfo();
 +  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
 +  const X86RegisterInfo *RegInfo =
 +      static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo());
 +  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
 +  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
 +  assert(MBBI != MBB.end() && "Returning block has no instructions");
 +  unsigned RetOpcode = MBBI->getOpcode();
 +  DebugLoc DL = MBBI->getDebugLoc();
 +  const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
 +  bool Is64Bit = STI.is64Bit();
 +  // standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit.
 +  const bool Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64();
 +  const bool Is64BitILP32 = STI.isTarget64BitILP32();
 +  bool UseLEA = STI.useLeaForSP();
 +  unsigned StackAlign = getStackAlignment();
 +  unsigned SlotSize = RegInfo->getSlotSize();
 +  unsigned FramePtr = RegInfo->getFrameRegister(MF);
 +  unsigned MachineFramePtr = Is64BitILP32 ?
 +             getX86SubSuperRegister(FramePtr, MVT::i64, false) : FramePtr;
 +  unsigned StackPtr = RegInfo->getStackRegister();
 +
 +  bool IsWinEH = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
 +  bool NeedsWinEH = IsWinEH && MF.getFunction()->needsUnwindTableEntry();
 +
 +  switch (RetOpcode) {
 +  default:
 +    llvm_unreachable("Can only insert epilog into returning blocks");
 +  case X86::RETQ:
 +  case X86::RETL:
 +  case X86::RETIL:
 +  case X86::RETIQ:
 +  case X86::TCRETURNdi:
 +  case X86::TCRETURNri:
 +  case X86::TCRETURNmi:
 +  case X86::TCRETURNdi64:
 +  case X86::TCRETURNri64:
 +  case X86::TCRETURNmi64:
 +  case X86::EH_RETURN:
 +  case X86::EH_RETURN64:
 +    break;  // These are ok
 +  }
 +
 +  // Get the number of bytes to allocate from the FrameInfo.
 +  uint64_t StackSize = MFI->getStackSize();
 +  uint64_t MaxAlign  = MFI->getMaxAlignment();
 +  unsigned CSSize = X86FI->getCalleeSavedFrameSize();
 +  uint64_t NumBytes = 0;
 +
 +  // If we're forcing a stack realignment we can't rely on just the frame
 +  // info, we need to know the ABI stack alignment as well in case we
 +  // have a call out.  Otherwise just make sure we have some alignment - we'll
 +  // go with the minimum.
 +  if (ForceStackAlign) {
 +    if (MFI->hasCalls())
 +      MaxAlign = (StackAlign > MaxAlign) ? StackAlign : MaxAlign;
 +    else
 +      MaxAlign = MaxAlign ? MaxAlign : 4;
 +  }
 +
 +  if (hasFP(MF)) {
 +    // Calculate required stack adjustment.
 +    uint64_t FrameSize = StackSize - SlotSize;
 +    if (RegInfo->needsStackRealignment(MF)) {
 +      // Callee-saved registers were pushed on stack before the stack
 +      // was realigned.
 +      FrameSize -= CSSize;
 +      NumBytes = (FrameSize + MaxAlign - 1) / MaxAlign * MaxAlign;
 +    } else {
 +      NumBytes = FrameSize - CSSize;
 +    }
 +
 +    // Pop EBP.
 +    BuildMI(MBB, MBBI, DL,
 +            TII.get(Is64Bit ? X86::POP64r : X86::POP32r), MachineFramePtr);
 +  } else {
 +    NumBytes = StackSize - CSSize;
 +  }
 +
 +  // Skip the callee-saved pop instructions.
 +  while (MBBI != MBB.begin()) {
 +    MachineBasicBlock::iterator PI = std::prev(MBBI);
 +    unsigned Opc = PI->getOpcode();
 +
 +    if (Opc != X86::POP32r && Opc != X86::POP64r && Opc != X86::DBG_VALUE &&
 +        !PI->isTerminator())
 +      break;
 +
 +    --MBBI;
 +  }
 +  MachineBasicBlock::iterator FirstCSPop = MBBI;
 +
 +  DL = MBBI->getDebugLoc();
 +
 +  // If there is an ADD32ri or SUB32ri of ESP immediately before this
 +  // instruction, merge the two instructions.
 +  if (NumBytes || MFI->hasVarSizedObjects())
 +    mergeSPUpdatesUp(MBB, MBBI, StackPtr, &NumBytes);
 +
 +  // If dynamic alloca is used, then reset esp to point to the last callee-saved
+  // slot before popping them off! The same applies when the stack was
+  // realigned.
 +  if (RegInfo->needsStackRealignment(MF) || MFI->hasVarSizedObjects()) {
 +    if (RegInfo->needsStackRealignment(MF))
 +      MBBI = FirstCSPop;
 +    if (CSSize != 0) {
 +      unsigned Opc = getLEArOpcode(Uses64BitFramePtr);
 +      addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr),
 +                   FramePtr, false, -CSSize);
 +      --MBBI;
 +    } else {
 +      unsigned Opc = (Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr);
 +      BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
 +        .addReg(FramePtr);
 +      --MBBI;
 +    }
 +  } else if (NumBytes) {
 +    // Adjust stack pointer back: ESP += numbytes.
 +    emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, Uses64BitFramePtr, UseLEA,
 +                 TII, *RegInfo);
 +    --MBBI;
 +  }
 +
 +  // Windows unwinder will not invoke function's exception handler if IP is
 +  // either in prologue or in epilogue.  This behavior causes a problem when a
 +  // call immediately precedes an epilogue, because the return address points
 +  // into the epilogue.  To cope with that, we insert an epilogue marker here,
 +  // then replace it with a 'nop' if it ends up immediately after a CALL in the
 +  // final emitted code.
 +  if (NeedsWinEH)
 +    BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_Epilogue));
 +
 +  // We're returning from function via eh_return.
 +  if (RetOpcode == X86::EH_RETURN || RetOpcode == X86::EH_RETURN64) {
 +    MBBI = MBB.getLastNonDebugInstr();
 +    MachineOperand &DestAddr  = MBBI->getOperand(0);
 +    assert(DestAddr.isReg() && "Offset should be in register!");
 +    BuildMI(MBB, MBBI, DL,
 +            TII.get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr),
 +            StackPtr).addReg(DestAddr.getReg());
 +  } else if (RetOpcode == X86::TCRETURNri || RetOpcode == X86::TCRETURNdi ||
 +             RetOpcode == X86::TCRETURNmi ||
 +             RetOpcode == X86::TCRETURNri64 || RetOpcode == X86::TCRETURNdi64 ||
 +             RetOpcode == X86::TCRETURNmi64) {
 +    bool isMem = RetOpcode == X86::TCRETURNmi || RetOpcode == X86::TCRETURNmi64;
 +    // Tail call return: adjust the stack pointer and jump to callee.
 +    MBBI = MBB.getLastNonDebugInstr();
 +    MachineOperand &JumpTarget = MBBI->getOperand(0);
 +    MachineOperand &StackAdjust = MBBI->getOperand(isMem ? 5 : 1);
 +    assert(StackAdjust.isImm() && "Expecting immediate value.");
 +
 +    // Adjust stack pointer.
 +    int StackAdj = StackAdjust.getImm();
 +    int MaxTCDelta = X86FI->getTCReturnAddrDelta();
 +    int Offset = 0;
 +    assert(MaxTCDelta <= 0 && "MaxTCDelta should never be positive");
 +
+    // Incorporate the retaddr area.
 +    Offset = StackAdj-MaxTCDelta;
 +    assert(Offset >= 0 && "Offset should never be negative");
 +
 +    if (Offset) {
 +      // Check for possible merge with preceding ADD instruction.
 +      Offset += mergeSPUpdates(MBB, MBBI, StackPtr, true);
 +      emitSPUpdate(MBB, MBBI, StackPtr, Offset, Is64Bit, Uses64BitFramePtr,
 +                   UseLEA, TII, *RegInfo);
 +    }
 +
 +    // Jump to label or value in register.
 +    bool IsWin64 = STI.isTargetWin64();
 +    if (RetOpcode == X86::TCRETURNdi || RetOpcode == X86::TCRETURNdi64) {
 +      unsigned Op = (RetOpcode == X86::TCRETURNdi)
 +                        ? X86::TAILJMPd
 +                        : (IsWin64 ? X86::TAILJMPd64_REX : X86::TAILJMPd64);
 +      MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII.get(Op));
 +      if (JumpTarget.isGlobal())
 +        MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(),
 +                             JumpTarget.getTargetFlags());
 +      else {
 +        assert(JumpTarget.isSymbol());
 +        MIB.addExternalSymbol(JumpTarget.getSymbolName(),
 +                              JumpTarget.getTargetFlags());
 +      }
 +    } else if (RetOpcode == X86::TCRETURNmi || RetOpcode == X86::TCRETURNmi64) {
 +      unsigned Op = (RetOpcode == X86::TCRETURNmi)
 +                        ? X86::TAILJMPm
 +                        : (IsWin64 ? X86::TAILJMPm64_REX : X86::TAILJMPm64);
 +      MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII.get(Op));
 +      for (unsigned i = 0; i != 5; ++i)
 +        MIB.addOperand(MBBI->getOperand(i));
 +    } else if (RetOpcode == X86::TCRETURNri64) {
 +      BuildMI(MBB, MBBI, DL,
 +              TII.get(IsWin64 ? X86::TAILJMPr64_REX : X86::TAILJMPr64))
 +          .addReg(JumpTarget.getReg(), RegState::Kill);
 +    } else {
 +      BuildMI(MBB, MBBI, DL, TII.get(X86::TAILJMPr)).
 +        addReg(JumpTarget.getReg(), RegState::Kill);
 +    }
 +
 +    MachineInstr *NewMI = std::prev(MBBI);
 +    NewMI->copyImplicitOps(MF, MBBI);
 +
 +    // Delete the pseudo instruction TCRETURN.
 +    MBB.erase(MBBI);
 +  } else if ((RetOpcode == X86::RETQ || RetOpcode == X86::RETL ||
 +              RetOpcode == X86::RETIQ || RetOpcode == X86::RETIL) &&
 +             (X86FI->getTCReturnAddrDelta() < 0)) {
 +    // Add the return addr area delta back since we are not tail calling.
 +    int delta = -1*X86FI->getTCReturnAddrDelta();
 +    MBBI = MBB.getLastNonDebugInstr();
 +
 +    // Check for possible merge with preceding ADD instruction.
 +    delta += mergeSPUpdates(MBB, MBBI, StackPtr, true);
 +    emitSPUpdate(MBB, MBBI, StackPtr, delta, Is64Bit, Uses64BitFramePtr, UseLEA, TII,
 +                 *RegInfo);
 +  }
 +}
 +
 +int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF,
 +                                          int FI) const {
 +  const X86RegisterInfo *RegInfo =
 +      static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo());
 +  const MachineFrameInfo *MFI = MF.getFrameInfo();
 +  int Offset = MFI->getObjectOffset(FI) - getOffsetOfLocalArea();
 +  uint64_t StackSize = MFI->getStackSize();
 +
 +  if (RegInfo->hasBasePointer(MF)) {
 +    assert (hasFP(MF) && "VLAs and dynamic stack realign, but no FP?!");
 +    if (FI < 0) {
 +      // Skip the saved EBP.
 +      return Offset + RegInfo->getSlotSize();
 +    } else {
 +      assert((-(Offset + StackSize)) % MFI->getObjectAlignment(FI) == 0);
 +      return Offset + StackSize;
 +    }
 +  } else if (RegInfo->needsStackRealignment(MF)) {
 +    if (FI < 0) {
 +      // Skip the saved EBP.
 +      return Offset + RegInfo->getSlotSize();
 +    } else {
 +      assert((-(Offset + StackSize)) % MFI->getObjectAlignment(FI) == 0);
 +      return Offset + StackSize;
 +    }
 +    // FIXME: Support tail calls
 +  } else {
 +    if (!hasFP(MF))
 +      return Offset + StackSize;
 +
 +    // Skip the saved EBP.
 +    Offset += RegInfo->getSlotSize();
 +
 +    // Skip the RETADDR move area
 +    const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
 +    int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
 +    if (TailCallReturnAddrDelta < 0)
 +      Offset -= TailCallReturnAddrDelta;
 +  }
 +
 +  return Offset;
 +}
 +
 +int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
 +                                             unsigned &FrameReg) const {
 +  const X86RegisterInfo *RegInfo =
 +      static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo());
 +  // We can't calculate offset from frame pointer if the stack is realigned,
 +  // so enforce usage of stack/base pointer.  The base pointer is used when we
 +  // have dynamic allocas in addition to dynamic realignment.
 +  if (RegInfo->hasBasePointer(MF))
 +    FrameReg = RegInfo->getBaseRegister();
 +  else if (RegInfo->needsStackRealignment(MF))
 +    FrameReg = RegInfo->getStackRegister();
 +  else
 +    FrameReg = RegInfo->getFrameRegister(MF);
 +  return getFrameIndexOffset(MF, FI);
 +}
 +
 +// Simplified from getFrameIndexOffset keeping only StackPointer cases
 +int X86FrameLowering::getFrameIndexOffsetFromSP(const MachineFunction &MF, int FI) const {
 +  const MachineFrameInfo *MFI = MF.getFrameInfo();
 +  // Does not include any dynamic realign.
 +  const uint64_t StackSize = MFI->getStackSize();
 +  {
 +#ifndef NDEBUG
 +    const X86RegisterInfo *RegInfo =
 +      static_cast<const X86RegisterInfo*>(MF.getSubtarget().getRegisterInfo());
 +    // Note: LLVM arranges the stack as:
 +    // Args > Saved RetPC (<--FP) > CSRs > dynamic alignment (<--BP)
 +    //      > "Stack Slots" (<--SP)
 +    // We can always address StackSlots from RSP.  We can usually (unless
 +    // needsStackRealignment) address CSRs from RSP, but sometimes need to
 +    // address them from RBP.  FixedObjects can be placed anywhere in the stack
 +    // frame depending on their specific requirements (i.e. we can actually
 +    // refer to arguments to the function which are stored in the *callers*
 +    // frame).  As a result, THE RESULT OF THIS CALL IS MEANINGLESS FOR CSRs
 +    // AND FixedObjects IFF needsStackRealignment or hasVarSizedObject.
 +
 +    assert(!RegInfo->hasBasePointer(MF) && "we don't handle this case");
 +
 +    // We don't handle tail calls, and shouldn't be seeing them
 +    // either.
 +    int TailCallReturnAddrDelta =
 +        MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta();
 +    assert(!(TailCallReturnAddrDelta < 0) && "we don't handle this case!");
 +#endif
 +  }
 +
 +  // This is how the math works out:
 +  //
 +  //  %rsp grows (i.e. gets lower) left to right. Each box below is
 +  //  one word (eight bytes).  Obj0 is the stack slot we're trying to
 +  //  get to.
 +  //
 +  //    ----------------------------------
 +  //    | BP | Obj0 | Obj1 | ... | ObjN |
 +  //    ----------------------------------
 +  //    ^    ^      ^                   ^
 +  //    A    B      C                   E
 +  //
 +  // A is the incoming stack pointer.
 +  // (B - A) is the local area offset (-8 for x86-64) [1]
 +  // (C - A) is the Offset returned by MFI->getObjectOffset for Obj0 [2]
 +  //
+  // |(E - B)| is the StackSize (absolute value, positive).  For a
+  // stack that grows down, this works out to be (B - E). [3]
 +  //
 +  // E is also the value of %rsp after stack has been set up, and we
 +  // want (C - E) -- the value we can add to %rsp to get to Obj0.  Now
 +  // (C - E) == (C - A) - (B - A) + (B - E)
 +  //            { Using [1], [2] and [3] above }
 +  //         == getObjectOffset - LocalAreaOffset + StackSize
 +  //
 +
 +  // Get the Offset from the StackPointer
 +  int Offset = MFI->getObjectOffset(FI) - getOffsetOfLocalArea();
 +
 +  return Offset + StackSize;
 +}
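
A hedged numeric instance of the (C - E) derivation above, with hypothetical values: a getOffsetOfLocalArea() of -8 (as noted in the diagram for x86-64), an object offset of -24, and a 32-byte frame give an SP-relative offset of 16.

  #include <cassert>
  #include <cstdint>

  int main() {
    int64_t ObjectOffset = -24;     // hypothetical MFI->getObjectOffset(FI), i.e. (C - A)
    int64_t LocalAreaOffset = -8;   // getOffsetOfLocalArea() for x86-64, i.e. (B - A)
    int64_t StackSize = 32;         // hypothetical MFI->getStackSize(), i.e. (B - E)
    int64_t Offset = ObjectOffset - LocalAreaOffset;  // (C - A) - (B - A) = (C - B)
    assert(Offset + StackSize == 16);                 // (C - E): add to %rsp to reach Obj0
    return 0;
  }
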
 +// Simplified from getFrameIndexReference keeping only StackPointer cases
 +int X86FrameLowering::getFrameIndexReferenceFromSP(const MachineFunction &MF, int FI,
 +                                                  unsigned &FrameReg) const {
 +  const X86RegisterInfo *RegInfo =
 +    static_cast<const X86RegisterInfo*>(MF.getSubtarget().getRegisterInfo());
 +
 +  assert(!RegInfo->hasBasePointer(MF) && "we don't handle this case");
 +
 +  FrameReg = RegInfo->getStackRegister();
 +  return getFrameIndexOffsetFromSP(MF, FI);
 +}
 +
 +bool X86FrameLowering::assignCalleeSavedSpillSlots(
 +    MachineFunction &MF, const TargetRegisterInfo *TRI,
 +    std::vector<CalleeSavedInfo> &CSI) const {
 +  MachineFrameInfo *MFI = MF.getFrameInfo();
 +  const X86RegisterInfo *RegInfo =
 +      static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo());
 +  unsigned SlotSize = RegInfo->getSlotSize();
 +  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
 +
 +  unsigned CalleeSavedFrameSize = 0;
 +  int SpillSlotOffset = getOffsetOfLocalArea() + X86FI->getTCReturnAddrDelta();
 +
 +  if (hasFP(MF)) {
+    // emitPrologue always spills the frame register first.
 +    SpillSlotOffset -= SlotSize;
 +    MFI->CreateFixedSpillStackObject(SlotSize, SpillSlotOffset);
 +
 +    // Since emitPrologue and emitEpilogue will handle spilling and restoring of
 +    // the frame register, we can delete it from CSI list and not have to worry
 +    // about avoiding it later.
 +    unsigned FPReg = RegInfo->getFrameRegister(MF);
 +    for (unsigned i = 0; i < CSI.size(); ++i) {
 +      if (TRI->regsOverlap(CSI[i].getReg(),FPReg)) {
 +        CSI.erase(CSI.begin() + i);
 +        break;
 +      }
 +    }
 +  }
 +
 +  // Assign slots for GPRs. It increases frame size.
 +  for (unsigned i = CSI.size(); i != 0; --i) {
 +    unsigned Reg = CSI[i - 1].getReg();
 +
 +    if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg))
 +      continue;
 +
 +    SpillSlotOffset -= SlotSize;
 +    CalleeSavedFrameSize += SlotSize;
 +
 +    int SlotIndex = MFI->CreateFixedSpillStackObject(SlotSize, SpillSlotOffset);
 +    CSI[i - 1].setFrameIdx(SlotIndex);
 +  }
 +
 +  X86FI->setCalleeSavedFrameSize(CalleeSavedFrameSize);
 +
 +  // Assign slots for XMMs.
 +  for (unsigned i = CSI.size(); i != 0; --i) {
 +    unsigned Reg = CSI[i - 1].getReg();
 +    if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg))
 +      continue;
 +
 +    const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg);
 +    // ensure alignment
 +    SpillSlotOffset -= std::abs(SpillSlotOffset) % RC->getAlignment();
 +    // spill into slot
 +    SpillSlotOffset -= RC->getSize();
 +    int SlotIndex =
 +        MFI->CreateFixedSpillStackObject(RC->getSize(), SpillSlotOffset);
 +    CSI[i - 1].setFrameIdx(SlotIndex);
 +    MFI->ensureMaxAlignment(RC->getAlignment());
 +  }
 +
 +  return true;
 +}
 +
 +bool X86FrameLowering::spillCalleeSavedRegisters(
 +    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
 +    const std::vector<CalleeSavedInfo> &CSI,
 +    const TargetRegisterInfo *TRI) const {
 +  DebugLoc DL = MBB.findDebugLoc(MI);
 +
 +  MachineFunction &MF = *MBB.getParent();
 +  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
 +  const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
 +
 +  // Push GPRs. It increases frame size.
 +  unsigned Opc = STI.is64Bit() ? X86::PUSH64r : X86::PUSH32r;
 +  for (unsigned i = CSI.size(); i != 0; --i) {
 +    unsigned Reg = CSI[i - 1].getReg();
 +
 +    if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg))
 +      continue;
 +    // Add the callee-saved register as live-in. It's killed at the spill.
 +    MBB.addLiveIn(Reg);
 +
 +    BuildMI(MBB, MI, DL, TII.get(Opc)).addReg(Reg, RegState::Kill)
 +      .setMIFlag(MachineInstr::FrameSetup);
 +  }
 +
+  // Spill XMM regs to stack frame slots, since X86 has no push/pop
+  // instructions for XMM registers.
 +  for (unsigned i = CSI.size(); i != 0; --i) {
 +    unsigned Reg = CSI[i-1].getReg();
 +    if (X86::GR64RegClass.contains(Reg) ||
 +        X86::GR32RegClass.contains(Reg))
 +      continue;
 +    // Add the callee-saved register as live-in. It's killed at the spill.
 +    MBB.addLiveIn(Reg);
 +    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
 +
 +    TII.storeRegToStackSlot(MBB, MI, Reg, true, CSI[i - 1].getFrameIdx(), RC,
 +                            TRI);
 +    --MI;
 +    MI->setFlag(MachineInstr::FrameSetup);
 +    ++MI;
 +  }
 +
 +  return true;
 +}
 +
+bool X86FrameLowering::restoreCalleeSavedRegisters(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+    const std::vector<CalleeSavedInfo> &CSI,
+    const TargetRegisterInfo *TRI) const {
 +  if (CSI.empty())
 +    return false;
 +
 +  DebugLoc DL = MBB.findDebugLoc(MI);
 +
 +  MachineFunction &MF = *MBB.getParent();
 +  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
 +  const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
 +
 +  // Reload XMMs from stack frame.
 +  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
 +    unsigned Reg = CSI[i].getReg();
 +    if (X86::GR64RegClass.contains(Reg) ||
 +        X86::GR32RegClass.contains(Reg))
 +      continue;
 +
 +    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
 +    TII.loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(), RC, TRI);
 +  }
 +
 +  // POP GPRs.
 +  unsigned Opc = STI.is64Bit() ? X86::POP64r : X86::POP32r;
 +  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
 +    unsigned Reg = CSI[i].getReg();
 +    if (!X86::GR64RegClass.contains(Reg) &&
 +        !X86::GR32RegClass.contains(Reg))
 +      continue;
 +
 +    BuildMI(MBB, MI, DL, TII.get(Opc), Reg);
 +  }
 +  return true;
 +}
 +
 +void
 +X86FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
 +                                                       RegScavenger *RS) const {
 +  MachineFrameInfo *MFI = MF.getFrameInfo();
 +  const X86RegisterInfo *RegInfo =
 +      static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo());
 +  unsigned SlotSize = RegInfo->getSlotSize();
 +
 +  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
 +  int64_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
 +
 +  if (TailCallReturnAddrDelta < 0) {
 +    // create RETURNADDR area
 +    //   arg
 +    //   arg
 +    //   RETADDR
 +    //   { ...
 +    //     RETADDR area
 +    //     ...
 +    //   }
 +    //   [EBP]
 +    MFI->CreateFixedObject(-TailCallReturnAddrDelta,
 +                           TailCallReturnAddrDelta - SlotSize, true);
 +  }
 +
 +  // Spill the BasePtr if it's used.
 +  if (RegInfo->hasBasePointer(MF))
 +    MF.getRegInfo().setPhysRegUsed(RegInfo->getBaseRegister());
 +}
 +
 +static bool
 +HasNestArgument(const MachineFunction *MF) {
 +  const Function *F = MF->getFunction();
 +  for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
 +       I != E; I++) {
 +    if (I->hasNestAttr())
 +      return true;
 +  }
 +  return false;
 +}
 +
 +/// GetScratchRegister - Get a temp register for performing work in the
 +/// segmented stack and the Erlang/HiPE stack prologue. Depending on platform
 +/// and the properties of the function either one or two registers will be
 +/// needed. Set primary to true for the first register, false for the second.
 +static unsigned
+GetScratchRegister(bool Is64Bit, bool IsLP64, const MachineFunction &MF,
+                   bool Primary) {
 +  CallingConv::ID CallingConvention = MF.getFunction()->getCallingConv();
 +
 +  // Erlang stuff.
 +  if (CallingConvention == CallingConv::HiPE) {
 +    if (Is64Bit)
 +      return Primary ? X86::R14 : X86::R13;
 +    else
 +      return Primary ? X86::EBX : X86::EDI;
 +  }
 +
 +  if (Is64Bit) {
 +    if (IsLP64)
 +      return Primary ? X86::R11 : X86::R12;
 +    else
 +      return Primary ? X86::R11D : X86::R12D;
 +  }
 +
 +  bool IsNested = HasNestArgument(&MF);
 +
 +  if (CallingConvention == CallingConv::X86_FastCall ||
 +      CallingConvention == CallingConv::Fast) {
 +    if (IsNested)
 +      report_fatal_error("Segmented stacks does not support fastcall with "
 +                         "nested function.");
 +    return Primary ? X86::EAX : X86::ECX;
 +  }
 +  if (IsNested)
 +    return Primary ? X86::EDX : X86::EAX;
 +  return Primary ? X86::ECX : X86::EAX;
 +}
 +
 +// The stack limit in the TCB is set to this many bytes above the actual stack
 +// limit.
 +static const uint64_t kSplitStackAvailable = 256;
 +
 +void
 +X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const {
 +  MachineBasicBlock &prologueMBB = MF.front();
 +  MachineFrameInfo *MFI = MF.getFrameInfo();
 +  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
 +  uint64_t StackSize;
 +  const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
 +  bool Is64Bit = STI.is64Bit();
 +  const bool IsLP64 = STI.isTarget64BitLP64();
 +  unsigned TlsReg, TlsOffset;
 +  DebugLoc DL;
 +
 +  unsigned ScratchReg = GetScratchRegister(Is64Bit, IsLP64, MF, true);
 +  assert(!MF.getRegInfo().isLiveIn(ScratchReg) &&
 +         "Scratch register is live-in");
 +
 +  if (MF.getFunction()->isVarArg())
 +    report_fatal_error("Segmented stacks do not support vararg functions.");
 +  if (!STI.isTargetLinux() && !STI.isTargetDarwin() && !STI.isTargetWin32() &&
 +      !STI.isTargetWin64() && !STI.isTargetFreeBSD() &&
 +      !STI.isTargetDragonFly())
 +    report_fatal_error("Segmented stacks not supported on this platform.");
 +
+  // Eventually StackSize will be calculated by a link-time pass, which will
 +  // also decide whether checking code needs to be injected into this particular
 +  // prologue.
 +  StackSize = MFI->getStackSize();
 +
 +  // Do not generate a prologue for functions with a stack of size zero
 +  if (StackSize == 0)
 +    return;
 +
 +  MachineBasicBlock *allocMBB = MF.CreateMachineBasicBlock();
 +  MachineBasicBlock *checkMBB = MF.CreateMachineBasicBlock();
 +  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
 +  bool IsNested = false;
 +
 +  // We need to know if the function has a nest argument only in 64 bit mode.
 +  if (Is64Bit)
 +    IsNested = HasNestArgument(&MF);
 +
+  // The MOV R10, RAX needs to be in a different block, since the RET we emit
+  // in allocMBB needs to be the last (terminating) instruction.
 +
 +  for (MachineBasicBlock::livein_iterator i = prologueMBB.livein_begin(),
 +         e = prologueMBB.livein_end(); i != e; i++) {
 +    allocMBB->addLiveIn(*i);
 +    checkMBB->addLiveIn(*i);
 +  }
 +
 +  if (IsNested)
 +    allocMBB->addLiveIn(IsLP64 ? X86::R10 : X86::R10D);
 +
 +  MF.push_front(allocMBB);
 +  MF.push_front(checkMBB);
 +
 +  // When the frame size is less than 256 we just compare the stack
 +  // boundary directly to the value of the stack pointer, per gcc.
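+  // Illustrative example: with a 100-byte frame we compare %rsp itself
+  // against the stacklet limit (the limit stored in the TCB already includes
+  // the kSplitStackAvailable slack), while a 4096-byte frame first computes
+  // %rsp - 4096 with an LEA and compares that instead.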
 +  bool CompareStackPointer = StackSize < kSplitStackAvailable;
 +
+  // Read the limit of the current stacklet from the stack_guard location.
 +  if (Is64Bit) {
 +    if (STI.isTargetLinux()) {
 +      TlsReg = X86::FS;
 +      TlsOffset = IsLP64 ? 0x70 : 0x40;
 +    } else if (STI.isTargetDarwin()) {
 +      TlsReg = X86::GS;
 +      TlsOffset = 0x60 + 90*8; // See pthread_machdep.h. Steal TLS slot 90.
 +    } else if (STI.isTargetWin64()) {
 +      TlsReg = X86::GS;
 +      TlsOffset = 0x28; // pvArbitrary, reserved for application use
 +    } else if (STI.isTargetFreeBSD()) {
 +      TlsReg = X86::FS;
 +      TlsOffset = 0x18;
 +    } else if (STI.isTargetDragonFly()) {
 +      TlsReg = X86::FS;
 +      TlsOffset = 0x20; // use tls_tcb.tcb_segstack
 +    } else {
 +      report_fatal_error("Segmented stacks not supported on this platform.");
 +    }
 +
 +    if (CompareStackPointer)
 +      ScratchReg = IsLP64 ? X86::RSP : X86::ESP;
 +    else
 +      BuildMI(checkMBB, DL, TII.get(IsLP64 ? X86::LEA64r : X86::LEA64_32r), ScratchReg).addReg(X86::RSP)
 +        .addImm(1).addReg(0).addImm(-StackSize).addReg(0);
 +
 +    BuildMI(checkMBB, DL, TII.get(IsLP64 ? X86::CMP64rm : X86::CMP32rm)).addReg(ScratchReg)
 +      .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg);
 +  } else {
 +    if (STI.isTargetLinux()) {
 +      TlsReg = X86::GS;
 +      TlsOffset = 0x30;
 +    } else if (STI.isTargetDarwin()) {
 +      TlsReg = X86::GS;
 +      TlsOffset = 0x48 + 90*4;
 +    } else if (STI.isTargetWin32()) {
 +      TlsReg = X86::FS;
 +      TlsOffset = 0x14; // pvArbitrary, reserved for application use
 +    } else if (STI.isTargetDragonFly()) {
 +      TlsReg = X86::FS;
 +      TlsOffset = 0x10; // use tls_tcb.tcb_segstack
 +    } else if (STI.isTargetFreeBSD()) {
 +      report_fatal_error("Segmented stacks not supported on FreeBSD i386.");
 +    } else {
 +      report_fatal_error("Segmented stacks not supported on this platform.");
 +    }
 +
 +    if (CompareStackPointer)
 +      ScratchReg = X86::ESP;
 +    else
 +      BuildMI(checkMBB, DL, TII.get(X86::LEA32r), ScratchReg).addReg(X86::ESP)
 +        .addImm(1).addReg(0).addImm(-StackSize).addReg(0);
 +
 +    if (STI.isTargetLinux() || STI.isTargetWin32() || STI.isTargetWin64() ||
 +        STI.isTargetDragonFly()) {
 +      BuildMI(checkMBB, DL, TII.get(X86::CMP32rm)).addReg(ScratchReg)
 +        .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg);
 +    } else if (STI.isTargetDarwin()) {
 +
 +      // TlsOffset doesn't fit into a mod r/m byte so we need an extra register.
 +      unsigned ScratchReg2;
 +      bool SaveScratch2;
 +      if (CompareStackPointer) {
 +        // The primary scratch register is available for holding the TLS offset.
 +        ScratchReg2 = GetScratchRegister(Is64Bit, IsLP64, MF, true);
 +        SaveScratch2 = false;
 +      } else {
 +        // Need to use a second register to hold the TLS offset
 +        ScratchReg2 = GetScratchRegister(Is64Bit, IsLP64, MF, false);
 +
 +        // Unfortunately, with fastcc the second scratch register may hold an
 +        // argument.
 +        SaveScratch2 = MF.getRegInfo().isLiveIn(ScratchReg2);
 +      }
 +
 +      // If Scratch2 is live-in then it needs to be saved.
 +      assert((!MF.getRegInfo().isLiveIn(ScratchReg2) || SaveScratch2) &&
 +             "Scratch register is live-in and not saved");
 +
 +      if (SaveScratch2)
 +        BuildMI(checkMBB, DL, TII.get(X86::PUSH32r))
 +          .addReg(ScratchReg2, RegState::Kill);
 +
 +      BuildMI(checkMBB, DL, TII.get(X86::MOV32ri), ScratchReg2)
 +        .addImm(TlsOffset);
 +      BuildMI(checkMBB, DL, TII.get(X86::CMP32rm))
 +        .addReg(ScratchReg)
 +        .addReg(ScratchReg2).addImm(1).addReg(0)
 +        .addImm(0)
 +        .addReg(TlsReg);
 +
 +      if (SaveScratch2)
 +        BuildMI(checkMBB, DL, TII.get(X86::POP32r), ScratchReg2);
 +    }
 +  }
 +
 +  // This jump is taken if SP >= (Stacklet Limit + Stack Space required).
 +  // It jumps to normal execution of the function body.
 +  BuildMI(checkMBB, DL, TII.get(X86::JA_1)).addMBB(&prologueMBB);
 +
 +  // On 32 bit we first push the arguments size and then the frame size. On 64
 +  // bit, we pass the stack frame size in r10 and the argument size in r11.
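+  // Illustrative (hypothetical) values: with StackSize == 4096 and an
+  // argument stack size of 16, the 64-bit path below materializes
+  // "mov r10, 4096; mov r11, 16" before the call to __morestack.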
 +  if (Is64Bit) {
+    // Functions with nested arguments use R10, so it needs to be saved across
+    // the call to __morestack.
 +
 +    const unsigned RegAX = IsLP64 ? X86::RAX : X86::EAX;
 +    const unsigned Reg10 = IsLP64 ? X86::R10 : X86::R10D;
 +    const unsigned Reg11 = IsLP64 ? X86::R11 : X86::R11D;
 +    const unsigned MOVrr = IsLP64 ? X86::MOV64rr : X86::MOV32rr;
 +    const unsigned MOVri = IsLP64 ? X86::MOV64ri : X86::MOV32ri;
 +
 +    if (IsNested)
 +      BuildMI(allocMBB, DL, TII.get(MOVrr), RegAX).addReg(Reg10);
 +
 +    BuildMI(allocMBB, DL, TII.get(MOVri), Reg10)
 +      .addImm(StackSize);
 +    BuildMI(allocMBB, DL, TII.get(MOVri), Reg11)
 +      .addImm(X86FI->getArgumentStackSize());
 +    MF.getRegInfo().setPhysRegUsed(Reg10);
 +    MF.getRegInfo().setPhysRegUsed(Reg11);
 +  } else {
 +    BuildMI(allocMBB, DL, TII.get(X86::PUSHi32))
 +      .addImm(X86FI->getArgumentStackSize());
 +    BuildMI(allocMBB, DL, TII.get(X86::PUSHi32))
 +      .addImm(StackSize);
 +  }
 +
 +  // __morestack is in libgcc
 +  if (Is64Bit && MF.getTarget().getCodeModel() == CodeModel::Large) {
 +    // Under the large code model, we cannot assume that __morestack lives
 +    // within 2^31 bytes of the call site, so we cannot use pc-relative
 +    // addressing. We cannot perform the call via a temporary register,
 +    // as the rax register may be used to store the static chain, and all
 +    // other suitable registers may be either callee-save or used for
 +    // parameter passing. We cannot use the stack at this point either
 +    // because __morestack manipulates the stack directly.
 +    //
 +    // To avoid these issues, perform an indirect call via a read-only memory
 +    // location containing the address.
 +    //
 +    // This solution is not perfect, as it assumes that the .rodata section
 +    // is laid out within 2^31 bytes of each function body, but this seems
 +    // to be sufficient for JIT.
 +    BuildMI(allocMBB, DL, TII.get(X86::CALL64m))
 +        .addReg(X86::RIP)
 +        .addImm(0)
 +        .addReg(0)
 +        .addExternalSymbol("__morestack_addr")
 +        .addReg(0);
 +    MF.getMMI().setUsesMorestackAddr(true);
 +  } else {
 +    if (Is64Bit)
 +      BuildMI(allocMBB, DL, TII.get(X86::CALL64pcrel32))
 +        .addExternalSymbol("__morestack");
 +    else
 +      BuildMI(allocMBB, DL, TII.get(X86::CALLpcrel32))
 +        .addExternalSymbol("__morestack");
 +  }
 +
 +  if (IsNested)
 +    BuildMI(allocMBB, DL, TII.get(X86::MORESTACK_RET_RESTORE_R10));
 +  else
 +    BuildMI(allocMBB, DL, TII.get(X86::MORESTACK_RET));
 +
 +  allocMBB->addSuccessor(&prologueMBB);
 +
 +  checkMBB->addSuccessor(allocMBB);
 +  checkMBB->addSuccessor(&prologueMBB);
 +
 +#ifdef XDEBUG
 +  MF.verify();
 +#endif
 +}
 +
 +/// Erlang programs may need a special prologue to handle the stack size they
 +/// might need at runtime. That is because Erlang/OTP does not implement a C
+/// stack but uses a custom implementation of a hybrid stack/heap
+/// architecture. (For more information see Eric Stenman's Ph.D. thesis:
+/// http://publications.uu.se/uu/fulltext/nbn_se_uu_diva-2688.pdf)
 +///
 +/// CheckStack:
 +///       temp0 = sp - MaxStack
 +///       if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart
 +/// OldStart:
 +///       ...
 +/// IncStack:
 +///       call inc_stack   # doubles the stack space
 +///       temp0 = sp - MaxStack
 +///       if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart
 +void X86FrameLowering::adjustForHiPEPrologue(MachineFunction &MF) const {
 +  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
 +  MachineFrameInfo *MFI = MF.getFrameInfo();
 +  const unsigned SlotSize =
 +      static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo())
 +          ->getSlotSize();
 +  const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
 +  const bool Is64Bit = STI.is64Bit();
 +  const bool IsLP64 = STI.isTarget64BitLP64();
 +  DebugLoc DL;
 +  // HiPE-specific values
 +  const unsigned HipeLeafWords = 24;
 +  const unsigned CCRegisteredArgs = Is64Bit ? 6 : 5;
 +  const unsigned Guaranteed = HipeLeafWords * SlotSize;
 +  unsigned CallerStkArity = MF.getFunction()->arg_size() > CCRegisteredArgs ?
 +                            MF.getFunction()->arg_size() - CCRegisteredArgs : 0;
 +  unsigned MaxStack = MFI->getStackSize() + CallerStkArity*SlotSize + SlotSize;
 +
 +  assert(STI.isTargetLinux() &&
 +         "HiPE prologue is only supported on Linux operating systems.");
 +
 +  // Compute the largest caller's frame that is needed to fit the callees'
 +  // frames. This 'MaxStack' is computed from:
 +  //
 +  // a) the fixed frame size, which is the space needed for all spilled temps,
 +  // b) outgoing on-stack parameter areas, and
 +  // c) the minimum stack space this function needs to make available for the
 +  //    functions it calls (a tunable ABI property).
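+  // Illustrative (hypothetical) example on x86-64 (SlotSize == 8,
+  // HipeLeafWords == 24, CCRegisteredArgs == 6): a caller taking 8 arguments
+  // has CallerStkArity == 2, so with StackSize == 32 the base MaxStack is
+  // 32 + 2*8 + 8 == 56; a call to a global callee of arity 7 (CalleeStkArity
+  // == 1) then adds (24 - 1 - 1) * 8 == 176 below, for a total of 232, which
+  // exceeds the 192-byte Guaranteed area and so triggers the runtime check.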
 +  if (MFI->hasCalls()) {
 +    unsigned MoreStackForCalls = 0;
 +
 +    for (MachineFunction::iterator MBBI = MF.begin(), MBBE = MF.end();
 +         MBBI != MBBE; ++MBBI)
 +      for (MachineBasicBlock::iterator MI = MBBI->begin(), ME = MBBI->end();
 +           MI != ME; ++MI) {
 +        if (!MI->isCall())
 +          continue;
 +
 +        // Get callee operand.
 +        const MachineOperand &MO = MI->getOperand(0);
 +
 +        // Only take account of global function calls (no closures etc.).
 +        if (!MO.isGlobal())
 +          continue;
 +
 +        const Function *F = dyn_cast<Function>(MO.getGlobal());
 +        if (!F)
 +          continue;
 +
+        // Do not update 'MaxStack' for primitive and built-in functions,
+        // as they are executed on another stack. These are encoded with
+        // names that either contain "erlang." or "bif_", or contain neither
+        // a "." (as a regular <Module>.<Function>.<Arity> name would) nor
+        // an "_" (as the BIF "suspend_0" does).
 +        if (F->getName().find("erlang.") != StringRef::npos ||
 +            F->getName().find("bif_") != StringRef::npos ||
 +            F->getName().find_first_of("._") == StringRef::npos)
 +          continue;
 +
 +        unsigned CalleeStkArity =
 +          F->arg_size() > CCRegisteredArgs ? F->arg_size()-CCRegisteredArgs : 0;
 +        if (HipeLeafWords - 1 > CalleeStkArity)
 +          MoreStackForCalls = std::max(MoreStackForCalls,
 +                               (HipeLeafWords - 1 - CalleeStkArity) * SlotSize);
 +      }
 +    MaxStack += MoreStackForCalls;
 +  }
 +
+  // If the stack frame needed is larger than the guaranteed size, runtime
+  // checks and calls to the "inc_stack_0" BIF should be inserted in the
+  // assembly prologue.
 +  if (MaxStack > Guaranteed) {
 +    MachineBasicBlock &prologueMBB = MF.front();
 +    MachineBasicBlock *stackCheckMBB = MF.CreateMachineBasicBlock();
 +    MachineBasicBlock *incStackMBB = MF.CreateMachineBasicBlock();
 +
 +    for (MachineBasicBlock::livein_iterator I = prologueMBB.livein_begin(),
 +           E = prologueMBB.livein_end(); I != E; I++) {
 +      stackCheckMBB->addLiveIn(*I);
 +      incStackMBB->addLiveIn(*I);
 +    }
 +
 +    MF.push_front(incStackMBB);
 +    MF.push_front(stackCheckMBB);
 +
 +    unsigned ScratchReg, SPReg, PReg, SPLimitOffset;
 +    unsigned LEAop, CMPop, CALLop;
 +    if (Is64Bit) {
 +      SPReg = X86::RSP;
 +      PReg  = X86::RBP;
 +      LEAop = X86::LEA64r;
 +      CMPop = X86::CMP64rm;
 +      CALLop = X86::CALL64pcrel32;
 +      SPLimitOffset = 0x90;
 +    } else {
 +      SPReg = X86::ESP;
 +      PReg  = X86::EBP;
 +      LEAop = X86::LEA32r;
 +      CMPop = X86::CMP32rm;
 +      CALLop = X86::CALLpcrel32;
 +      SPLimitOffset = 0x4c;
 +    }
 +
 +    ScratchReg = GetScratchRegister(Is64Bit, IsLP64, MF, true);
 +    assert(!MF.getRegInfo().isLiveIn(ScratchReg) &&
 +           "HiPE prologue scratch register is live-in");
 +
 +    // Create new MBB for StackCheck:
 +    addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(LEAop), ScratchReg),
 +                 SPReg, false, -MaxStack);
+    // SPLimitOffset is in a fixed heap location (pointed to by BP).
 +    addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(CMPop))
 +                 .addReg(ScratchReg), PReg, false, SPLimitOffset);
 +    BuildMI(stackCheckMBB, DL, TII.get(X86::JAE_1)).addMBB(&prologueMBB);
 +
 +    // Create new MBB for IncStack:
 +    BuildMI(incStackMBB, DL, TII.get(CALLop)).
 +      addExternalSymbol("inc_stack_0");
 +    addRegOffset(BuildMI(incStackMBB, DL, TII.get(LEAop), ScratchReg),
 +                 SPReg, false, -MaxStack);
 +    addRegOffset(BuildMI(incStackMBB, DL, TII.get(CMPop))
 +                 .addReg(ScratchReg), PReg, false, SPLimitOffset);
 +    BuildMI(incStackMBB, DL, TII.get(X86::JLE_1)).addMBB(incStackMBB);
 +
 +    stackCheckMBB->addSuccessor(&prologueMBB, 99);
 +    stackCheckMBB->addSuccessor(incStackMBB, 1);
 +    incStackMBB->addSuccessor(&prologueMBB, 99);
 +    incStackMBB->addSuccessor(incStackMBB, 1);
 +  }
 +#ifdef XDEBUG
 +  MF.verify();
 +#endif
 +}
 +
 +void X86FrameLowering::
 +eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
 +                              MachineBasicBlock::iterator I) const {
 +  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
 +  const X86RegisterInfo &RegInfo = *static_cast<const X86RegisterInfo *>(
 +                                       MF.getSubtarget().getRegisterInfo());
 +  unsigned StackPtr = RegInfo.getStackRegister();
 +  bool reserveCallFrame = hasReservedCallFrame(MF);
 +  int Opcode = I->getOpcode();
 +  bool isDestroy = Opcode == TII.getCallFrameDestroyOpcode();
 +  const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
 +  bool IsLP64 = STI.isTarget64BitLP64();
 +  DebugLoc DL = I->getDebugLoc();
 +  uint64_t Amount = !reserveCallFrame ? I->getOperand(0).getImm() : 0;
 +  uint64_t InternalAmt = (isDestroy || Amount) ? I->getOperand(1).getImm() : 0;
 +  I = MBB.erase(I);
 +
 +  if (!reserveCallFrame) {
 +    // If the stack pointer can be changed after prologue, turn the
+    // adjcallstackdown instruction into a 'sub ESP, <amt>' and the
+    // adjcallstackup instruction into an 'add ESP, <amt>'.
 +    if (Amount == 0)
 +      return;
 +
 +    // We need to keep the stack aligned properly.  To do this, we round the
 +    // amount of space needed for the outgoing arguments up to the next
 +    // alignment boundary.
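+    // For example, with a 16-byte StackAlign an Amount of 20 is rounded up to
+    // (20 + 15) / 16 * 16 == 32.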
 +    unsigned StackAlign = MF.getTarget()
 +                              .getSubtargetImpl()
 +                              ->getFrameLowering()
 +                              ->getStackAlignment();
 +    Amount = (Amount + StackAlign - 1) / StackAlign * StackAlign;
 +
 +    MachineInstr *New = nullptr;
 +
 +    // Factor out the amount that gets handled inside the sequence
+    // (pushes of arguments for frame setup, callee pops for frame destroy).
 +    Amount -= InternalAmt;
 +
 +    if (Amount) {
 +      if (Opcode == TII.getCallFrameSetupOpcode()) {
 +        New = BuildMI(MF, DL, TII.get(getSUBriOpcode(IsLP64, Amount)), StackPtr)
 +          .addReg(StackPtr).addImm(Amount);
 +      } else {
 +        assert(Opcode == TII.getCallFrameDestroyOpcode());
 +
 +        unsigned Opc = getADDriOpcode(IsLP64, Amount);
 +        New = BuildMI(MF, DL, TII.get(Opc), StackPtr)
 +          .addReg(StackPtr).addImm(Amount);
 +      }
 +    }
 +
 +    if (New) {
 +      // The EFLAGS implicit def is dead.
 +      New->getOperand(3).setIsDead();
 +
 +      // Replace the pseudo instruction with a new instruction.
 +      MBB.insert(I, New);
 +    }
 +
 +    return;
 +  }
 +
 +  if (Opcode == TII.getCallFrameDestroyOpcode() && InternalAmt) {
 +    // If we are performing frame pointer elimination and if the callee pops
 +    // something off the stack pointer, add it back.  We do this until we have
 +    // more advanced stack pointer tracking ability.
 +    unsigned Opc = getSUBriOpcode(IsLP64, InternalAmt);
 +    MachineInstr *New = BuildMI(MF, DL, TII.get(Opc), StackPtr)
 +      .addReg(StackPtr).addImm(InternalAmt);
 +
 +    // The EFLAGS implicit def is dead.
 +    New->getOperand(3).setIsDead();
 +
 +    // We are not tracking the stack pointer adjustment by the callee, so make
+    // sure we restore the stack pointer immediately after the call; there may
 +    // be spill code inserted between the CALL and ADJCALLSTACKUP instructions.
 +    MachineBasicBlock::iterator B = MBB.begin();
 +    while (I != B && !std::prev(I)->isCall())
 +      --I;
 +    MBB.insert(I, New);
 +  }
 +}
 +
 diff --git a/llvm/lib/Target/X86/X86FrameLowering.h b/llvm/lib/Target/X86/X86FrameLowering.h index dd8fc3240c2..d93c9467182 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.h +++ b/llvm/lib/Target/X86/X86FrameLowering.h @@ -1,95 +1,97 @@ -//===-- X86TargetFrameLowering.h - Define frame lowering for X86 -*- C++ -*-==// -// -//                     The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This class implements X86-specific bits of TargetFrameLowering class. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_X86_X86FRAMELOWERING_H -#define LLVM_LIB_TARGET_X86_X86FRAMELOWERING_H - -#include "llvm/Target/TargetFrameLowering.h" - -namespace llvm { - -class MCSymbol; -class X86TargetMachine; -class X86Subtarget; - -class X86FrameLowering : public TargetFrameLowering { -public: -  explicit X86FrameLowering(StackDirection D, unsigned StackAl, int LAO) -    : TargetFrameLowering(StackGrowsDown, StackAl, LAO) {} - -  /// Emit a call to the target's stack probe function. This is required for all -  /// large stack allocations on Windows. The caller is required to materialize -  /// the number of bytes to probe in RAX/EAX. -  static void emitStackProbeCall(MachineFunction &MF, MachineBasicBlock &MBB, -                                 MachineBasicBlock::iterator MBBI, DebugLoc DL); - -  void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, -                                 MachineBasicBlock::iterator MBBI, -                                 DebugLoc DL) const; - -  /// emitProlog/emitEpilog - These methods insert prolog and epilog code into -  /// the function. 
-  void emitPrologue(MachineFunction &MF) const override; -  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; - -  void adjustForSegmentedStacks(MachineFunction &MF) const override; - -  void adjustForHiPEPrologue(MachineFunction &MF) const override; - -  void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, -                                     RegScavenger *RS = nullptr) const override; - -  bool -  assignCalleeSavedSpillSlots(MachineFunction &MF, -                              const TargetRegisterInfo *TRI, -                              std::vector<CalleeSavedInfo> &CSI) const override; - -  bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, -                                 MachineBasicBlock::iterator MI, -                                 const std::vector<CalleeSavedInfo> &CSI, -                                 const TargetRegisterInfo *TRI) const override; - -  bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB, -                                  MachineBasicBlock::iterator MI, -                                  const std::vector<CalleeSavedInfo> &CSI, -                                  const TargetRegisterInfo *TRI) const override; - -  bool hasFP(const MachineFunction &MF) const override; -  bool hasReservedCallFrame(const MachineFunction &MF) const override; - -  int getFrameIndexOffset(const MachineFunction &MF, int FI) const override; -  int getFrameIndexReference(const MachineFunction &MF, int FI, -                             unsigned &FrameReg) const override; - -  int getFrameIndexOffsetFromSP(const MachineFunction &MF, int FI) const; -  int getFrameIndexReferenceFromSP(const MachineFunction &MF, int FI, -                                   unsigned &FrameReg) const override; - -  void eliminateCallFramePseudoInstr(MachineFunction &MF, -                                 MachineBasicBlock &MBB, -                                 MachineBasicBlock::iterator MI) const override; - -private: -  /// convertArgMovsToPushes - This method tries to convert a call sequence -  /// that uses sub and mov instructions to put the argument onto the stack -  /// into a series of pushes. -  /// Returns true if the transformation succeeded, false if not. -  bool convertArgMovsToPushes(MachineFunction &MF,  -                              MachineBasicBlock &MBB, -                              MachineBasicBlock::iterator I,  -                              uint64_t Amount) const; -}; - -} // End llvm namespace - -#endif +//===-- X86TargetFrameLowering.h - Define frame lowering for X86 -*- C++ -*-==//
 +//
 +//                     The LLVM Compiler Infrastructure
 +//
 +// This file is distributed under the University of Illinois Open Source
 +// License. See LICENSE.TXT for details.
 +//
 +//===----------------------------------------------------------------------===//
 +//
 +// This class implements X86-specific bits of TargetFrameLowering class.
 +//
 +//===----------------------------------------------------------------------===//
 +
 +#ifndef LLVM_LIB_TARGET_X86_X86FRAMELOWERING_H
 +#define LLVM_LIB_TARGET_X86_X86FRAMELOWERING_H
 +
 +#include "llvm/Target/TargetFrameLowering.h"
 +
 +namespace llvm {
 +
 +class MCSymbol;
 +class X86TargetMachine;
 +class X86Subtarget;
 +
 +class X86FrameLowering : public TargetFrameLowering {
 +public:
 +  explicit X86FrameLowering(StackDirection D, unsigned StackAl, int LAO)
 +    : TargetFrameLowering(StackGrowsDown, StackAl, LAO) {}
 +
 +  /// Emit a call to the target's stack probe function. This is required for all
 +  /// large stack allocations on Windows. The caller is required to materialize
 +  /// the number of bytes to probe in RAX/EAX.
 +  static void emitStackProbeCall(MachineFunction &MF, MachineBasicBlock &MBB,
 +                                 MachineBasicBlock::iterator MBBI, DebugLoc DL);
 +
 +  void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
 +                                 MachineBasicBlock::iterator MBBI,
 +                                 DebugLoc DL) const;
 +
 +  /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
 +  /// the function.
 +  void emitPrologue(MachineFunction &MF) const override;
 +  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
 +
 +  void adjustForSegmentedStacks(MachineFunction &MF) const override;
 +
 +  void adjustForHiPEPrologue(MachineFunction &MF) const override;
 +
 +  void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
 +                                     RegScavenger *RS = nullptr) const override;
 +
 +  bool
 +  assignCalleeSavedSpillSlots(MachineFunction &MF,
 +                              const TargetRegisterInfo *TRI,
 +                              std::vector<CalleeSavedInfo> &CSI) const override;
 +
 +  bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
 +                                 MachineBasicBlock::iterator MI,
 +                                 const std::vector<CalleeSavedInfo> &CSI,
 +                                 const TargetRegisterInfo *TRI) const override;
 +
 +  bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
 +                                  MachineBasicBlock::iterator MI,
 +                                  const std::vector<CalleeSavedInfo> &CSI,
 +                                  const TargetRegisterInfo *TRI) const override;
 +
 +  bool hasFP(const MachineFunction &MF) const override;
 +  bool hasReservedCallFrame(const MachineFunction &MF) const override;
 +  bool canSimplifyCallFramePseudos(const MachineFunction &MF) const override;
 +  bool needsFrameIndexResolution(const MachineFunction &MF) const override;
 +
 +  int getFrameIndexOffset(const MachineFunction &MF, int FI) const override;
 +  int getFrameIndexReference(const MachineFunction &MF, int FI,
 +                             unsigned &FrameReg) const override;
 +
 +  int getFrameIndexOffsetFromSP(const MachineFunction &MF, int FI) const;
 +  int getFrameIndexReferenceFromSP(const MachineFunction &MF, int FI,
 +                                   unsigned &FrameReg) const override;
 +
 +  void eliminateCallFramePseudoInstr(MachineFunction &MF,
 +                                 MachineBasicBlock &MBB,
 +                                 MachineBasicBlock::iterator MI) const override;
 +
 +private:
 +  /// convertArgMovsToPushes - This method tries to convert a call sequence
 +  /// that uses sub and mov instructions to put the argument onto the stack
 +  /// into a series of pushes.
 +  /// Returns true if the transformation succeeded, false if not.
+  bool convertArgMovsToPushes(MachineFunction &MF,
+                              MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I,
+                              uint64_t Amount) const;
 +};
 +
 +} // End llvm namespace
 +
 +#endif
 diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td index be0e4b790a2..fa1dfa7a524 100644 --- a/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/llvm/lib/Target/X86/X86InstrCompiler.td @@ -1,1848 +1,1852 @@ -//===- X86InstrCompiler.td - Compiler Pseudos and Patterns -*- tablegen -*-===// -// -//                     The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file describes the various pseudo instructions used by the compiler, -// as well as Pat patterns used during instruction selection. -// -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// Pattern Matching Support - -def GetLo32XForm : SDNodeXForm<imm, [{ -  // Transformation function: get the low 32 bits. -  return getI32Imm((unsigned)N->getZExtValue()); -}]>; - -def GetLo8XForm : SDNodeXForm<imm, [{ -  // Transformation function: get the low 8 bits. -  return getI8Imm((uint8_t)N->getZExtValue()); -}]>; - - -//===----------------------------------------------------------------------===// -// Random Pseudo Instructions. - -// PIC base construction.  This expands to code that looks like this: -//     call  $next_inst -//     popl %destreg" -let hasSideEffects = 0, isNotDuplicable = 1, Uses = [ESP] in -  def MOVPC32r : Ii32<0xE8, Pseudo, (outs GR32:$reg), (ins i32imm:$label), -                      "", []>; - - -// ADJCALLSTACKDOWN/UP implicitly use/def ESP because they may be expanded into -// a stack adjustment and the codegen must know that they may modify the stack -// pointer before prolog-epilog rewriting occurs. -// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become -// sub / add which can clobber EFLAGS. -let Defs = [ESP, EFLAGS], Uses = [ESP] in { -def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs), (ins i32imm:$amt), -                           "#ADJCALLSTACKDOWN", -                           [(X86callseq_start timm:$amt)]>, -                          Requires<[NotLP64]>; -def ADJCALLSTACKUP32   : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2), -                           "#ADJCALLSTACKUP", -                           [(X86callseq_end timm:$amt1, timm:$amt2)]>, -                          Requires<[NotLP64]>; -} - -// ADJCALLSTACKDOWN/UP implicitly use/def RSP because they may be expanded into -// a stack adjustment and the codegen must know that they may modify the stack -// pointer before prolog-epilog rewriting occurs. -// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become -// sub / add which can clobber EFLAGS. -let Defs = [RSP, EFLAGS], Uses = [RSP] in { -def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs), (ins i32imm:$amt), -                           "#ADJCALLSTACKDOWN", -                           [(X86callseq_start timm:$amt)]>, -                          Requires<[IsLP64]>; -def ADJCALLSTACKUP64   : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2), -                           "#ADJCALLSTACKUP", -                           [(X86callseq_end timm:$amt1, timm:$amt2)]>, -                          Requires<[IsLP64]>; -} - - - -// x86-64 va_start lowering magic. 
-let usesCustomInserter = 1, Defs = [EFLAGS] in { -def VASTART_SAVE_XMM_REGS : I<0, Pseudo, -                              (outs), -                              (ins GR8:$al, -                                   i64imm:$regsavefi, i64imm:$offset, -                                   variable_ops), -                              "#VASTART_SAVE_XMM_REGS $al, $regsavefi, $offset", -                              [(X86vastart_save_xmm_regs GR8:$al, -                                                         imm:$regsavefi, -                                                         imm:$offset), -                               (implicit EFLAGS)]>; - -// The VAARG_64 pseudo-instruction takes the address of the va_list, -// and places the address of the next argument into a register. -let Defs = [EFLAGS] in -def VAARG_64 : I<0, Pseudo, -                 (outs GR64:$dst), -                 (ins i8mem:$ap, i32imm:$size, i8imm:$mode, i32imm:$align), -                 "#VAARG_64 $dst, $ap, $size, $mode, $align", -                 [(set GR64:$dst, -                    (X86vaarg64 addr:$ap, imm:$size, imm:$mode, imm:$align)), -                  (implicit EFLAGS)]>; - -// Dynamic stack allocation yields a _chkstk or _alloca call for all Windows -// targets.  These calls are needed to probe the stack when allocating more than -// 4k bytes in one go. Touching the stack at 4K increments is necessary to -// ensure that the guard pages used by the OS virtual memory manager are -// allocated in correct sequence. -// The main point of having separate instruction are extra unmodelled effects -// (compared to ordinary calls) like stack pointer change. - -let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in -  def WIN_ALLOCA : I<0, Pseudo, (outs), (ins), -                     "# dynamic stack allocation", -                     [(X86WinAlloca)]>; - -// When using segmented stacks these are lowered into instructions which first -// check if the current stacklet has enough free memory. If it does, memory is -// allocated by bumping the stack pointer. Otherwise memory is allocated from -// the heap. - -let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in -def SEG_ALLOCA_32 : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$size), -                      "# variable sized alloca for segmented stacks", -                      [(set GR32:$dst, -                         (X86SegAlloca GR32:$size))]>, -                    Requires<[NotLP64]>; - -let Defs = [RAX, RSP, EFLAGS], Uses = [RSP] in -def SEG_ALLOCA_64 : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$size), -                      "# variable sized alloca for segmented stacks", -                      [(set GR64:$dst, -                         (X86SegAlloca GR64:$size))]>, -                    Requires<[In64BitMode]>; -} - -// The MSVC runtime contains an _ftol2 routine for converting floating-point -// to integer values. It has a strange calling convention: the input is -// popped from the x87 stack, and the return value is given in EDX:EAX. ECX is -// used as a temporary register. No other registers (aside from flags) are -// touched. -// Microsoft toolchains do not support 80-bit precision, so a WIN_FTOL_80 -// variant is unnecessary. 
- -let Defs = [EAX, EDX, ECX, EFLAGS], FPForm = SpecialFP in { -  def WIN_FTOL_32 : I<0, Pseudo, (outs), (ins RFP32:$src), -                      "# win32 fptoui", -                      [(X86WinFTOL RFP32:$src)]>, -                    Requires<[Not64BitMode]>; - -  def WIN_FTOL_64 : I<0, Pseudo, (outs), (ins RFP64:$src), -                      "# win32 fptoui", -                      [(X86WinFTOL RFP64:$src)]>, -                    Requires<[Not64BitMode]>; -} - -//===----------------------------------------------------------------------===// -// EH Pseudo Instructions -// -let SchedRW = [WriteSystem] in { -let isTerminator = 1, isReturn = 1, isBarrier = 1, -    hasCtrlDep = 1, isCodeGenOnly = 1 in { -def EH_RETURN   : I<0xC3, RawFrm, (outs), (ins GR32:$addr), -                    "ret\t#eh_return, addr: $addr", -                    [(X86ehret GR32:$addr)], IIC_RET>, Sched<[WriteJumpLd]>; - -} - -let isTerminator = 1, isReturn = 1, isBarrier = 1, -    hasCtrlDep = 1, isCodeGenOnly = 1 in { -def EH_RETURN64   : I<0xC3, RawFrm, (outs), (ins GR64:$addr), -                     "ret\t#eh_return, addr: $addr", -                     [(X86ehret GR64:$addr)], IIC_RET>, Sched<[WriteJumpLd]>; - -} - -let hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1, -    usesCustomInserter = 1 in { -  def EH_SjLj_SetJmp32  : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$buf), -                            "#EH_SJLJ_SETJMP32", -                            [(set GR32:$dst, (X86eh_sjlj_setjmp addr:$buf))]>, -                          Requires<[Not64BitMode]>; -  def EH_SjLj_SetJmp64  : I<0, Pseudo, (outs GR32:$dst), (ins i64mem:$buf), -                            "#EH_SJLJ_SETJMP64", -                            [(set GR32:$dst, (X86eh_sjlj_setjmp addr:$buf))]>, -                          Requires<[In64BitMode]>; -  let isTerminator = 1 in { -  def EH_SjLj_LongJmp32 : I<0, Pseudo, (outs), (ins i32mem:$buf), -                            "#EH_SJLJ_LONGJMP32", -                            [(X86eh_sjlj_longjmp addr:$buf)]>, -                          Requires<[Not64BitMode]>; -  def EH_SjLj_LongJmp64 : I<0, Pseudo, (outs), (ins i64mem:$buf), -                            "#EH_SJLJ_LONGJMP64", -                            [(X86eh_sjlj_longjmp addr:$buf)]>, -                          Requires<[In64BitMode]>; -  } -} -} // SchedRW - -let isBranch = 1, isTerminator = 1, isCodeGenOnly = 1 in { -  def EH_SjLj_Setup : I<0, Pseudo, (outs), (ins brtarget:$dst), -                        "#EH_SjLj_Setup\t$dst", []>; -} - -//===----------------------------------------------------------------------===// -// Pseudo instructions used by unwind info. 
-// -let isPseudo = 1 in { -  def SEH_PushReg : I<0, Pseudo, (outs), (ins i32imm:$reg), -                            "#SEH_PushReg $reg", []>; -  def SEH_SaveReg : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$dst), -                            "#SEH_SaveReg $reg, $dst", []>; -  def SEH_SaveXMM : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$dst), -                            "#SEH_SaveXMM $reg, $dst", []>; -  def SEH_StackAlloc : I<0, Pseudo, (outs), (ins i32imm:$size), -                            "#SEH_StackAlloc $size", []>; -  def SEH_SetFrame : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$offset), -                            "#SEH_SetFrame $reg, $offset", []>; -  def SEH_PushFrame : I<0, Pseudo, (outs), (ins i1imm:$mode), -                            "#SEH_PushFrame $mode", []>; -  def SEH_EndPrologue : I<0, Pseudo, (outs), (ins), -                            "#SEH_EndPrologue", []>; -  def SEH_Epilogue : I<0, Pseudo, (outs), (ins), -                            "#SEH_Epilogue", []>; -} - -//===----------------------------------------------------------------------===// -// Pseudo instructions used by segmented stacks. -// - -// This is lowered into a RET instruction by MCInstLower.  We need -// this so that we don't have to have a MachineBasicBlock which ends -// with a RET and also has successors. -let isPseudo = 1 in { -def MORESTACK_RET: I<0, Pseudo, (outs), (ins), -                          "", []>; - -// This instruction is lowered to a RET followed by a MOV.  The two -// instructions are not generated on a higher level since then the -// verifier sees a MachineBasicBlock ending with a non-terminator. -def MORESTACK_RET_RESTORE_R10 : I<0, Pseudo, (outs), (ins), -                                  "", []>; -} - -//===----------------------------------------------------------------------===// -// Alias Instructions -//===----------------------------------------------------------------------===// - -// Alias instruction mapping movr0 to xor. -// FIXME: remove when we can teach regalloc that xor reg, reg is ok. -let Defs = [EFLAGS], isReMaterializable = 1, isAsCheapAsAMove = 1, -    isPseudo = 1 in -def MOV32r0  : I<0, Pseudo, (outs GR32:$dst), (ins), "", -                 [(set GR32:$dst, 0)], IIC_ALU_NONMEM>, Sched<[WriteZero]>; - -// Other widths can also make use of the 32-bit xor, which may have a smaller -// encoding and avoid partial register updates. -def : Pat<(i8 0), (EXTRACT_SUBREG (MOV32r0), sub_8bit)>; -def : Pat<(i16 0), (EXTRACT_SUBREG (MOV32r0), sub_16bit)>; -def : Pat<(i64 0), (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit)> { -  let AddedComplexity = 20; -} - -// Materialize i64 constant where top 32-bits are zero. This could theoretically -// use MOV32ri with a SUBREG_TO_REG to represent the zero-extension, however -// that would make it more difficult to rematerialize. -let AddedComplexity = 1, isReMaterializable = 1, isAsCheapAsAMove = 1, -    isCodeGenOnly = 1, hasSideEffects = 0 in -def MOV32ri64 : Ii32<0xb8, AddRegFrm, (outs GR32:$dst), (ins i64i32imm:$src), -                     "", [], IIC_ALU_NONMEM>, Sched<[WriteALU]>; - -// This 64-bit pseudo-move can be used for both a 64-bit constant that is -// actually the zero-extension of a 32-bit constant, and for labels in the -// x86-64 small code model. -def mov64imm32 : ComplexPattern<i64, 1, "SelectMOV64Imm32", [imm, X86Wrapper]>; - -let AddedComplexity = 1 in -def : Pat<(i64 mov64imm32:$src), -          (SUBREG_TO_REG (i64 0), (MOV32ri64 mov64imm32:$src), sub_32bit)>; - -// Use sbb to materialize carry bit. 
-let Uses = [EFLAGS], Defs = [EFLAGS], isPseudo = 1, SchedRW = [WriteALU] in { -// FIXME: These are pseudo ops that should be replaced with Pat<> patterns. -// However, Pat<> can't replicate the destination reg into the inputs of the -// result. -def SETB_C8r : I<0, Pseudo, (outs GR8:$dst), (ins), "", -                 [(set GR8:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>; -def SETB_C16r : I<0, Pseudo, (outs GR16:$dst), (ins), "", -                 [(set GR16:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>; -def SETB_C32r : I<0, Pseudo, (outs GR32:$dst), (ins), "", -                 [(set GR32:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>; -def SETB_C64r : I<0, Pseudo, (outs GR64:$dst), (ins), "", -                 [(set GR64:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>; -} // isCodeGenOnly - - -def : Pat<(i16 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), -          (SETB_C16r)>; -def : Pat<(i32 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), -          (SETB_C32r)>; -def : Pat<(i64 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), -          (SETB_C64r)>; - -def : Pat<(i16 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), -          (SETB_C16r)>; -def : Pat<(i32 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), -          (SETB_C32r)>; -def : Pat<(i64 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), -          (SETB_C64r)>; - -// We canonicalize 'setb' to "(and (sbb reg,reg), 1)" on the hope that the and -// will be eliminated and that the sbb can be extended up to a wider type.  When -// this happens, it is great.  However, if we are left with an 8-bit sbb and an -// and, we might as well just match it as a setb. -def : Pat<(and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1), -          (SETBr)>; - -// (add OP, SETB) -> (adc OP, 0) -def : Pat<(add (and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR8:$op), -          (ADC8ri GR8:$op, 0)>; -def : Pat<(add (and (i32 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR32:$op), -          (ADC32ri8 GR32:$op, 0)>; -def : Pat<(add (and (i64 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR64:$op), -          (ADC64ri8 GR64:$op, 0)>; - -// (sub OP, SETB) -> (sbb OP, 0) -def : Pat<(sub GR8:$op, (and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1)), -          (SBB8ri GR8:$op, 0)>; -def : Pat<(sub GR32:$op, (and (i32 (X86setcc_c X86_COND_B, EFLAGS)), 1)), -          (SBB32ri8 GR32:$op, 0)>; -def : Pat<(sub GR64:$op, (and (i64 (X86setcc_c X86_COND_B, EFLAGS)), 1)), -          (SBB64ri8 GR64:$op, 0)>; - -// (sub OP, SETCC_CARRY) -> (adc OP, 0) -def : Pat<(sub GR8:$op, (i8 (X86setcc_c X86_COND_B, EFLAGS))), -          (ADC8ri GR8:$op, 0)>; -def : Pat<(sub GR32:$op, (i32 (X86setcc_c X86_COND_B, EFLAGS))), -          (ADC32ri8 GR32:$op, 0)>; -def : Pat<(sub GR64:$op, (i64 (X86setcc_c X86_COND_B, EFLAGS))), -          (ADC64ri8 GR64:$op, 0)>; - -//===----------------------------------------------------------------------===// -// String Pseudo Instructions -// -let SchedRW = [WriteMicrocoded] in { -let Defs = [ECX,EDI,ESI], Uses = [ECX,EDI,ESI], isCodeGenOnly = 1 in { -def REP_MOVSB_32 : I<0xA4, RawFrm, (outs), (ins), "{rep;movsb|rep movsb}", -                    [(X86rep_movs i8)], IIC_REP_MOVS>, REP, -                   Requires<[Not64BitMode]>; -def REP_MOVSW_32 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsw|rep movsw}", -                    [(X86rep_movs i16)], IIC_REP_MOVS>, REP, OpSize16, -                   Requires<[Not64BitMode]>; -def REP_MOVSD_32 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsl|rep movsd}", -                    [(X86rep_movs i32)], IIC_REP_MOVS>, REP, OpSize32, -                   
Requires<[Not64BitMode]>; -} - -let Defs = [RCX,RDI,RSI], Uses = [RCX,RDI,RSI], isCodeGenOnly = 1 in { -def REP_MOVSB_64 : I<0xA4, RawFrm, (outs), (ins), "{rep;movsb|rep movsb}", -                    [(X86rep_movs i8)], IIC_REP_MOVS>, REP, -                   Requires<[In64BitMode]>; -def REP_MOVSW_64 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsw|rep movsw}", -                    [(X86rep_movs i16)], IIC_REP_MOVS>, REP, OpSize16, -                   Requires<[In64BitMode]>; -def REP_MOVSD_64 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsl|rep movsd}", -                    [(X86rep_movs i32)], IIC_REP_MOVS>, REP, OpSize32, -                   Requires<[In64BitMode]>; -def REP_MOVSQ_64 : RI<0xA5, RawFrm, (outs), (ins), "{rep;movsq|rep movsq}", -                    [(X86rep_movs i64)], IIC_REP_MOVS>, REP, -                   Requires<[In64BitMode]>; -} - -// FIXME: Should use "(X86rep_stos AL)" as the pattern. -let Defs = [ECX,EDI], isCodeGenOnly = 1 in { -  let Uses = [AL,ECX,EDI] in -  def REP_STOSB_32 : I<0xAA, RawFrm, (outs), (ins), "{rep;stosb|rep stosb}", -                      [(X86rep_stos i8)], IIC_REP_STOS>, REP, -                     Requires<[Not64BitMode]>; -  let Uses = [AX,ECX,EDI] in -  def REP_STOSW_32 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosw|rep stosw}", -                      [(X86rep_stos i16)], IIC_REP_STOS>, REP, OpSize16, -                     Requires<[Not64BitMode]>; -  let Uses = [EAX,ECX,EDI] in -  def REP_STOSD_32 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosl|rep stosd}", -                      [(X86rep_stos i32)], IIC_REP_STOS>, REP, OpSize32, -                     Requires<[Not64BitMode]>; -} - -let Defs = [RCX,RDI], isCodeGenOnly = 1 in { -  let Uses = [AL,RCX,RDI] in -  def REP_STOSB_64 : I<0xAA, RawFrm, (outs), (ins), "{rep;stosb|rep stosb}", -                      [(X86rep_stos i8)], IIC_REP_STOS>, REP, -                     Requires<[In64BitMode]>; -  let Uses = [AX,RCX,RDI] in -  def REP_STOSW_64 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosw|rep stosw}", -                      [(X86rep_stos i16)], IIC_REP_STOS>, REP, OpSize16, -                     Requires<[In64BitMode]>; -  let Uses = [RAX,RCX,RDI] in -  def REP_STOSD_64 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosl|rep stosd}", -                      [(X86rep_stos i32)], IIC_REP_STOS>, REP, OpSize32, -                     Requires<[In64BitMode]>; - -  let Uses = [RAX,RCX,RDI] in -  def REP_STOSQ_64 : RI<0xAB, RawFrm, (outs), (ins), "{rep;stosq|rep stosq}", -                      [(X86rep_stos i64)], IIC_REP_STOS>, REP, -                     Requires<[In64BitMode]>; -} -} // SchedRW - -//===----------------------------------------------------------------------===// -// Thread Local Storage Instructions -// - -// ELF TLS Support -// All calls clobber the non-callee saved registers. ESP is marked as -// a use to prevent stack-pointer assignments that appear immediately -// before calls from potentially appearing dead. 
-let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, FP7, -            ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7, -            MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, -            XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, -            XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS], -    Uses = [ESP] in { -def TLS_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym), -                  "# TLS_addr32", -                  [(X86tlsaddr tls32addr:$sym)]>, -                  Requires<[Not64BitMode]>; -def TLS_base_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym), -                  "# TLS_base_addr32", -                  [(X86tlsbaseaddr tls32baseaddr:$sym)]>, -                  Requires<[Not64BitMode]>; -} - -// All calls clobber the non-callee saved registers. RSP is marked as -// a use to prevent stack-pointer assignments that appear immediately -// before calls from potentially appearing dead. -let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11, -            FP0, FP1, FP2, FP3, FP4, FP5, FP6, FP7, -            ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7, -            MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, -            XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, -            XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS], -    Uses = [RSP] in { -def TLS_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym), -                   "# TLS_addr64", -                  [(X86tlsaddr tls64addr:$sym)]>, -                  Requires<[In64BitMode]>; -def TLS_base_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym), -                   "# TLS_base_addr64", -                  [(X86tlsbaseaddr tls64baseaddr:$sym)]>, -                  Requires<[In64BitMode]>; -} - -// Darwin TLS Support -// For i386, the address of the thunk is passed on the stack, on return the -// address of the variable is in %eax.  %ecx is trashed during the function -// call.  All other registers are preserved. -let Defs = [EAX, ECX, EFLAGS], -    Uses = [ESP], -    usesCustomInserter = 1 in -def TLSCall_32 : I<0, Pseudo, (outs), (ins i32mem:$sym), -                "# TLSCall_32", -                [(X86TLSCall addr:$sym)]>, -                Requires<[Not64BitMode]>; - -// For x86_64, the address of the thunk is passed in %rdi, on return -// the address of the variable is in %rax.  All other registers are preserved. -let Defs = [RAX, EFLAGS], -    Uses = [RSP, RDI], -    usesCustomInserter = 1 in -def TLSCall_64 : I<0, Pseudo, (outs), (ins i64mem:$sym), -                  "# TLSCall_64", -                  [(X86TLSCall addr:$sym)]>, -                  Requires<[In64BitMode]>; - - -//===----------------------------------------------------------------------===// -// Conditional Move Pseudo Instructions - -// X86 doesn't have 8-bit conditional moves. Use a customInserter to -// emit control flow. An alternative to this is to mark i8 SELECT as Promote, -// however that requires promoting the operands, and can induce additional -// i8 register pressure. 
-let usesCustomInserter = 1, Uses = [EFLAGS] in { -def CMOV_GR8 : I<0, Pseudo, -                 (outs GR8:$dst), (ins GR8:$src1, GR8:$src2, i8imm:$cond), -                 "#CMOV_GR8 PSEUDO!", -                 [(set GR8:$dst, (X86cmov GR8:$src1, GR8:$src2, -                                          imm:$cond, EFLAGS))]>; - -let Predicates = [NoCMov] in { -def CMOV_GR32 : I<0, Pseudo, -                    (outs GR32:$dst), (ins GR32:$src1, GR32:$src2, i8imm:$cond), -                    "#CMOV_GR32* PSEUDO!", -                    [(set GR32:$dst, -                      (X86cmov GR32:$src1, GR32:$src2, imm:$cond, EFLAGS))]>; -def CMOV_GR16 : I<0, Pseudo, -                    (outs GR16:$dst), (ins GR16:$src1, GR16:$src2, i8imm:$cond), -                    "#CMOV_GR16* PSEUDO!", -                    [(set GR16:$dst, -                      (X86cmov GR16:$src1, GR16:$src2, imm:$cond, EFLAGS))]>; -} // Predicates = [NoCMov] - -// fcmov doesn't handle all possible EFLAGS, provide a fallback if there is no -// SSE1. -let Predicates = [FPStackf32] in -def CMOV_RFP32 : I<0, Pseudo, -                    (outs RFP32:$dst), -                    (ins RFP32:$src1, RFP32:$src2, i8imm:$cond), -                    "#CMOV_RFP32 PSEUDO!", -                    [(set RFP32:$dst, -                      (X86cmov RFP32:$src1, RFP32:$src2, imm:$cond, -                                                  EFLAGS))]>; -// fcmov doesn't handle all possible EFLAGS, provide a fallback if there is no -// SSE2. -let Predicates = [FPStackf64] in -def CMOV_RFP64 : I<0, Pseudo, -                    (outs RFP64:$dst), -                    (ins RFP64:$src1, RFP64:$src2, i8imm:$cond), -                    "#CMOV_RFP64 PSEUDO!", -                    [(set RFP64:$dst, -                      (X86cmov RFP64:$src1, RFP64:$src2, imm:$cond, -                                                  EFLAGS))]>; -def CMOV_RFP80 : I<0, Pseudo, -                    (outs RFP80:$dst), -                    (ins RFP80:$src1, RFP80:$src2, i8imm:$cond), -                    "#CMOV_RFP80 PSEUDO!", -                    [(set RFP80:$dst, -                      (X86cmov RFP80:$src1, RFP80:$src2, imm:$cond, -                                                  EFLAGS))]>; -} // UsesCustomInserter = 1, Uses = [EFLAGS] - - -//===----------------------------------------------------------------------===// -// Normal-Instructions-With-Lock-Prefix Pseudo Instructions -//===----------------------------------------------------------------------===// - -// FIXME: Use normal instructions and add lock prefix dynamically. - -// Memory barriers - -// TODO: Get this to fold the constant into the instruction. 
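One place the locked no-op RMW shows up (a hedged sketch; the function name is mine): a sequentially consistent fence on x86 targets that lack or avoid MFENCE can be lowered to a locked OR of zero into a stack slot, which is what the OR32mrLocked and Int_MemBarrier definitions that follow model.

  #include <atomic>

  void publish(int *slot, int value) {
    *slot = value;
    std::atomic_thread_fence(std::memory_order_seq_cst);  // full barrier, e.g. lock or / mfence
  }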
-let isCodeGenOnly = 1, Defs = [EFLAGS] in -def OR32mrLocked  : I<0x09, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$zero), -                      "or{l}\t{$zero, $dst|$dst, $zero}", -                      [], IIC_ALU_MEM>, Requires<[Not64BitMode]>, LOCK, -                    Sched<[WriteALULd, WriteRMW]>; - -let hasSideEffects = 1 in -def Int_MemBarrier : I<0, Pseudo, (outs), (ins), -                     "#MEMBARRIER", -                     [(X86MemBarrier)]>, Sched<[WriteLoad]>; - -// RegOpc corresponds to the mr version of the instruction -// ImmOpc corresponds to the mi version of the instruction -// ImmOpc8 corresponds to the mi8 version of the instruction -// ImmMod corresponds to the instruction format of the mi and mi8 versions -multiclass LOCK_ArithBinOp<bits<8> RegOpc, bits<8> ImmOpc, bits<8> ImmOpc8, -                           Format ImmMod, string mnemonic> { -let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1, -    SchedRW = [WriteALULd, WriteRMW] in { - -def NAME#8mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4}, -                  RegOpc{3}, RegOpc{2}, RegOpc{1}, 0 }, -                  MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src2), -                  !strconcat(mnemonic, "{b}\t", -                             "{$src2, $dst|$dst, $src2}"), -                  [], IIC_ALU_NONMEM>, LOCK; -def NAME#16mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4}, -                   RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 }, -                   MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2), -                   !strconcat(mnemonic, "{w}\t", -                              "{$src2, $dst|$dst, $src2}"), -                   [], IIC_ALU_NONMEM>, OpSize16, LOCK; -def NAME#32mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4}, -                   RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 }, -                   MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2), -                   !strconcat(mnemonic, "{l}\t", -                              "{$src2, $dst|$dst, $src2}"), -                   [], IIC_ALU_NONMEM>, OpSize32, LOCK; -def NAME#64mr : RI<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4}, -                    RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 }, -                    MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2), -                    !strconcat(mnemonic, "{q}\t", -                               "{$src2, $dst|$dst, $src2}"), -                    [], IIC_ALU_NONMEM>, LOCK; - -def NAME#8mi : Ii8<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, -                    ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 0 }, -                    ImmMod, (outs), (ins i8mem :$dst, i8imm :$src2), -                    !strconcat(mnemonic, "{b}\t", -                               "{$src2, $dst|$dst, $src2}"), -                    [], IIC_ALU_MEM>, LOCK; - -def NAME#16mi : Ii16<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, -                      ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 }, -                      ImmMod, (outs), (ins i16mem :$dst, i16imm :$src2), -                      !strconcat(mnemonic, "{w}\t", -                                 "{$src2, $dst|$dst, $src2}"), -                      [], IIC_ALU_MEM>, OpSize16, LOCK; - -def NAME#32mi : Ii32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, -                      ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 }, -                      ImmMod, (outs), (ins i32mem :$dst, i32imm :$src2), -                      !strconcat(mnemonic, "{l}\t", -                                 "{$src2, $dst|$dst, $src2}"), -                      [], IIC_ALU_MEM>, OpSize32, LOCK; - -def NAME#64mi32 : 
RIi32S<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, -                          ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 }, -                          ImmMod, (outs), (ins i64mem :$dst, i64i32imm :$src2), -                          !strconcat(mnemonic, "{q}\t", -                                     "{$src2, $dst|$dst, $src2}"), -                          [], IIC_ALU_MEM>, LOCK; - -def NAME#16mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4}, -                      ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 }, -                      ImmMod, (outs), (ins i16mem :$dst, i16i8imm :$src2), -                      !strconcat(mnemonic, "{w}\t", -                                 "{$src2, $dst|$dst, $src2}"), -                      [], IIC_ALU_MEM>, OpSize16, LOCK; -def NAME#32mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4}, -                      ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 }, -                      ImmMod, (outs), (ins i32mem :$dst, i32i8imm :$src2), -                      !strconcat(mnemonic, "{l}\t", -                                 "{$src2, $dst|$dst, $src2}"), -                      [], IIC_ALU_MEM>, OpSize32, LOCK; -def NAME#64mi8 : RIi8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4}, -                       ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 }, -                       ImmMod, (outs), (ins i64mem :$dst, i64i8imm :$src2), -                       !strconcat(mnemonic, "{q}\t", -                                  "{$src2, $dst|$dst, $src2}"), -                       [], IIC_ALU_MEM>, LOCK; - -} - -} - -defm LOCK_ADD : LOCK_ArithBinOp<0x00, 0x80, 0x83, MRM0m, "add">; -defm LOCK_SUB : LOCK_ArithBinOp<0x28, 0x80, 0x83, MRM5m, "sub">; -defm LOCK_OR  : LOCK_ArithBinOp<0x08, 0x80, 0x83, MRM1m, "or">; -defm LOCK_AND : LOCK_ArithBinOp<0x20, 0x80, 0x83, MRM4m, "and">; -defm LOCK_XOR : LOCK_ArithBinOp<0x30, 0x80, 0x83, MRM6m, "xor">; - -// Optimized codegen when the non-memory output is not used. -multiclass LOCK_ArithUnOp<bits<8> Opc8, bits<8> Opc, Format Form, -                          string mnemonic> { -let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1, -    SchedRW = [WriteALULd, WriteRMW] in { - -def NAME#8m  : I<Opc8, Form, (outs), (ins i8mem :$dst), -                 !strconcat(mnemonic, "{b}\t$dst"), -                 [], IIC_UNARY_MEM>, LOCK; -def NAME#16m : I<Opc, Form, (outs), (ins i16mem:$dst), -                 !strconcat(mnemonic, "{w}\t$dst"), -                 [], IIC_UNARY_MEM>, OpSize16, LOCK; -def NAME#32m : I<Opc, Form, (outs), (ins i32mem:$dst), -                 !strconcat(mnemonic, "{l}\t$dst"), -                 [], IIC_UNARY_MEM>, OpSize32, LOCK; -def NAME#64m : RI<Opc, Form, (outs), (ins i64mem:$dst), -                  !strconcat(mnemonic, "{q}\t$dst"), -                  [], IIC_UNARY_MEM>, LOCK; -} -} - -defm LOCK_INC    : LOCK_ArithUnOp<0xFE, 0xFF, MRM0m, "inc">; -defm LOCK_DEC    : LOCK_ArithUnOp<0xFE, 0xFF, MRM1m, "dec">; - -// Atomic compare and swap. 
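A minimal use of the compare-and-swap forms defined next (the function name is mine): compare_exchange_strong on a plain int is typically selected as a single lock cmpxchg, with the expected value kept in the accumulator register exactly as the Defs/Uses lists below require.

  #include <atomic>

  bool try_claim(std::atomic<int> &flag) {
    int expected = 0;
    return flag.compare_exchange_strong(expected, 1);   // lock cmpxchgl, EAX holds `expected`
  }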
-multiclass LCMPXCHG_UnOp<bits<8> Opc, Format Form, string mnemonic, -                         SDPatternOperator frag, X86MemOperand x86memop, -                         InstrItinClass itin> { -let isCodeGenOnly = 1 in { -  def NAME : I<Opc, Form, (outs), (ins x86memop:$ptr), -               !strconcat(mnemonic, "\t$ptr"), -               [(frag addr:$ptr)], itin>, TB, LOCK; -} -} - -multiclass LCMPXCHG_BinOp<bits<8> Opc8, bits<8> Opc, Format Form, -                          string mnemonic, SDPatternOperator frag, -                          InstrItinClass itin8, InstrItinClass itin> { -let isCodeGenOnly = 1, SchedRW = [WriteALULd, WriteRMW] in { -  let Defs = [AL, EFLAGS], Uses = [AL] in -  def NAME#8  : I<Opc8, Form, (outs), (ins i8mem:$ptr, GR8:$swap), -                  !strconcat(mnemonic, "{b}\t{$swap, $ptr|$ptr, $swap}"), -                  [(frag addr:$ptr, GR8:$swap, 1)], itin8>, TB, LOCK; -  let Defs = [AX, EFLAGS], Uses = [AX] in -  def NAME#16 : I<Opc, Form, (outs), (ins i16mem:$ptr, GR16:$swap), -                  !strconcat(mnemonic, "{w}\t{$swap, $ptr|$ptr, $swap}"), -                  [(frag addr:$ptr, GR16:$swap, 2)], itin>, TB, OpSize16, LOCK; -  let Defs = [EAX, EFLAGS], Uses = [EAX] in -  def NAME#32 : I<Opc, Form, (outs), (ins i32mem:$ptr, GR32:$swap), -                  !strconcat(mnemonic, "{l}\t{$swap, $ptr|$ptr, $swap}"), -                  [(frag addr:$ptr, GR32:$swap, 4)], itin>, TB, OpSize32, LOCK; -  let Defs = [RAX, EFLAGS], Uses = [RAX] in -  def NAME#64 : RI<Opc, Form, (outs), (ins i64mem:$ptr, GR64:$swap), -                   !strconcat(mnemonic, "{q}\t{$swap, $ptr|$ptr, $swap}"), -                   [(frag addr:$ptr, GR64:$swap, 8)], itin>, TB, LOCK; -} -} - -let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX], -    SchedRW = [WriteALULd, WriteRMW] in { -defm LCMPXCHG8B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg8b", -                                X86cas8, i64mem, -                                IIC_CMPX_LOCK_8B>; -} - -let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX], -    Predicates = [HasCmpxchg16b], SchedRW = [WriteALULd, WriteRMW] in { -defm LCMPXCHG16B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg16b", -                                 X86cas16, i128mem, -                                 IIC_CMPX_LOCK_16B>, REX_W; -} - -defm LCMPXCHG : LCMPXCHG_BinOp<0xB0, 0xB1, MRMDestMem, "cmpxchg", -                               X86cas, IIC_CMPX_LOCK_8, IIC_CMPX_LOCK>; - -// Atomic exchange and add -multiclass ATOMIC_LOAD_BINOP<bits<8> opc8, bits<8> opc, string mnemonic, -                             string frag, -                             InstrItinClass itin8, InstrItinClass itin> { -  let Constraints = "$val = $dst", Defs = [EFLAGS], isCodeGenOnly = 1, -      SchedRW = [WriteALULd, WriteRMW] in { -    def NAME#8  : I<opc8, MRMSrcMem, (outs GR8:$dst), -                    (ins GR8:$val, i8mem:$ptr), -                    !strconcat(mnemonic, "{b}\t{$val, $ptr|$ptr, $val}"), -                    [(set GR8:$dst, -                          (!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val))], -                    itin8>; -    def NAME#16 : I<opc, MRMSrcMem, (outs GR16:$dst), -                    (ins GR16:$val, i16mem:$ptr), -                    !strconcat(mnemonic, "{w}\t{$val, $ptr|$ptr, $val}"), -                    [(set -                       GR16:$dst, -                       (!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val))], -                    itin>, OpSize16; -    def NAME#32 : I<opc, MRMSrcMem, (outs GR32:$dst), -                    (ins 
GR32:$val, i32mem:$ptr), -                    !strconcat(mnemonic, "{l}\t{$val, $ptr|$ptr, $val}"), -                    [(set -                       GR32:$dst, -                       (!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val))], -                    itin>, OpSize32; -    def NAME#64 : RI<opc, MRMSrcMem, (outs GR64:$dst), -                     (ins GR64:$val, i64mem:$ptr), -                     !strconcat(mnemonic, "{q}\t{$val, $ptr|$ptr, $val}"), -                     [(set -                        GR64:$dst, -                        (!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val))], -                     itin>; -  } -} - -defm LXADD : ATOMIC_LOAD_BINOP<0xc0, 0xc1, "xadd", "atomic_load_add", -                               IIC_XADD_LOCK_MEM8, IIC_XADD_LOCK_MEM>, -             TB, LOCK; - -/* The following multiclass tries to make sure that in code like - *    x.store (immediate op x.load(acquire), release) - * an operation directly on memory is generated instead of wasting a register. - * It is not automatic as atomic_store/load are only lowered to MOV instructions - * extremely late to prevent them from being accidentally reordered in the backend - * (see below the RELEASE_MOV* / ACQUIRE_MOV* pseudo-instructions) - */ -multiclass RELEASE_BINOP_MI<string op> { -    def NAME#8mi : I<0, Pseudo, (outs), (ins i8mem:$dst, i8imm:$src), -        "#RELEASE_BINOP PSEUDO!", -        [(atomic_store_8 addr:$dst, (!cast<PatFrag>(op) -            (atomic_load_8 addr:$dst), (i8 imm:$src)))]>; -    // NAME#16 is not generated as 16-bit arithmetic instructions are considered -    // costly and avoided as far as possible by this backend anyway -    def NAME#32mi : I<0, Pseudo, (outs), (ins i32mem:$dst, i32imm:$src), -        "#RELEASE_BINOP PSEUDO!", -        [(atomic_store_32 addr:$dst, (!cast<PatFrag>(op) -            (atomic_load_32 addr:$dst), (i32 imm:$src)))]>; -    def NAME#64mi32 : I<0, Pseudo, (outs), (ins i64mem:$dst, i64i32imm:$src), -        "#RELEASE_BINOP PSEUDO!", -        [(atomic_store_64 addr:$dst, (!cast<PatFrag>(op) -            (atomic_load_64 addr:$dst), (i64immSExt32:$src)))]>; -} -defm RELEASE_ADD : RELEASE_BINOP_MI<"add">; -defm RELEASE_AND : RELEASE_BINOP_MI<"and">; -defm RELEASE_OR  : RELEASE_BINOP_MI<"or">; -defm RELEASE_XOR : RELEASE_BINOP_MI<"xor">; -// Note: we don't deal with sub, because substractions of constants are -// optimized into additions before this code can run - -multiclass RELEASE_UNOP<dag dag8, dag dag16, dag dag32, dag dag64> { -    def NAME#8m : I<0, Pseudo, (outs), (ins i8mem:$dst), -        "#RELEASE_UNOP PSEUDO!", -        [(atomic_store_8 addr:$dst, dag8)]>; -    def NAME#16m : I<0, Pseudo, (outs), (ins i16mem:$dst), -        "#RELEASE_UNOP PSEUDO!", -        [(atomic_store_16 addr:$dst, dag16)]>; -    def NAME#32m : I<0, Pseudo, (outs), (ins i32mem:$dst), -        "#RELEASE_UNOP PSEUDO!", -        [(atomic_store_32 addr:$dst, dag32)]>; -    def NAME#64m : I<0, Pseudo, (outs), (ins i64mem:$dst), -        "#RELEASE_UNOP PSEUDO!", -        [(atomic_store_64 addr:$dst, dag64)]>; -} - -defm RELEASE_INC : RELEASE_UNOP< -    (add (atomic_load_8  addr:$dst), (i8 1)), -    (add (atomic_load_16 addr:$dst), (i16 1)), -    (add (atomic_load_32 addr:$dst), (i32 1)), -    (add (atomic_load_64 addr:$dst), (i64 1))>, Requires<[NotSlowIncDec]>; -defm RELEASE_DEC : RELEASE_UNOP< -    (add (atomic_load_8  addr:$dst), (i8 -1)), -    (add (atomic_load_16 addr:$dst), (i16 -1)), -    (add (atomic_load_32 addr:$dst), (i32 -1)), -    (add (atomic_load_64 
addr:$dst), (i64 -1))>, Requires<[NotSlowIncDec]>; -/* -TODO: These don't work because the type inference of TableGen fails. -TODO: find a way to fix it. -defm RELEASE_NEG : RELEASE_UNOP< -    (ineg (atomic_load_8  addr:$dst)), -    (ineg (atomic_load_16 addr:$dst)), -    (ineg (atomic_load_32 addr:$dst)), -    (ineg (atomic_load_64 addr:$dst))>; -defm RELEASE_NOT : RELEASE_UNOP< -    (not (atomic_load_8  addr:$dst)), -    (not (atomic_load_16 addr:$dst)), -    (not (atomic_load_32 addr:$dst)), -    (not (atomic_load_64 addr:$dst))>; -*/ - -def RELEASE_MOV8mi : I<0, Pseudo, (outs), (ins i8mem:$dst, i8imm:$src), -			"#RELEASE_MOV PSEUDO !", -			[(atomic_store_8 addr:$dst, (i8 imm:$src))]>; -def RELEASE_MOV16mi : I<0, Pseudo, (outs), (ins i16mem:$dst, i16imm:$src), -			"#RELEASE_MOV PSEUDO !", -			[(atomic_store_16 addr:$dst, (i16 imm:$src))]>; -def RELEASE_MOV32mi : I<0, Pseudo, (outs), (ins i32mem:$dst, i32imm:$src), -			"#RELEASE_MOV PSEUDO !", -			[(atomic_store_32 addr:$dst, (i32 imm:$src))]>; -def RELEASE_MOV64mi32 : I<0, Pseudo, (outs), (ins i64mem:$dst, i64i32imm:$src), -			"#RELEASE_MOV PSEUDO !", -			[(atomic_store_64 addr:$dst, i64immSExt32:$src)]>; - -def RELEASE_MOV8mr  : I<0, Pseudo, (outs), (ins i8mem :$dst, GR8 :$src), -                        "#RELEASE_MOV PSEUDO!", -                        [(atomic_store_8  addr:$dst, GR8 :$src)]>; -def RELEASE_MOV16mr : I<0, Pseudo, (outs), (ins i16mem:$dst, GR16:$src), -                        "#RELEASE_MOV PSEUDO!", -                        [(atomic_store_16 addr:$dst, GR16:$src)]>; -def RELEASE_MOV32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, GR32:$src), -                        "#RELEASE_MOV PSEUDO!", -                        [(atomic_store_32 addr:$dst, GR32:$src)]>; -def RELEASE_MOV64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, GR64:$src), -                        "#RELEASE_MOV PSEUDO!", -                        [(atomic_store_64 addr:$dst, GR64:$src)]>; - -def ACQUIRE_MOV8rm  : I<0, Pseudo, (outs GR8 :$dst), (ins i8mem :$src), -                      "#ACQUIRE_MOV PSEUDO!", -                      [(set GR8:$dst,  (atomic_load_8  addr:$src))]>; -def ACQUIRE_MOV16rm : I<0, Pseudo, (outs GR16:$dst), (ins i16mem:$src), -                      "#ACQUIRE_MOV PSEUDO!", -                      [(set GR16:$dst, (atomic_load_16 addr:$src))]>; -def ACQUIRE_MOV32rm : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$src), -                      "#ACQUIRE_MOV PSEUDO!", -                      [(set GR32:$dst, (atomic_load_32 addr:$src))]>; -def ACQUIRE_MOV64rm : I<0, Pseudo, (outs GR64:$dst), (ins i64mem:$src), -                      "#ACQUIRE_MOV PSEUDO!", -                      [(set GR64:$dst, (atomic_load_64 addr:$src))]>; -//===----------------------------------------------------------------------===// -// Conditional Move Pseudo Instructions. -//===----------------------------------------------------------------------===// - -// CMOV* - Used to implement the SSE SELECT DAG operation.  Expanded after -// instruction selection into a branch sequence. 
-let Uses = [EFLAGS], usesCustomInserter = 1 in { -  def CMOV_FR32 : I<0, Pseudo, -                    (outs FR32:$dst), (ins FR32:$t, FR32:$f, i8imm:$cond), -                    "#CMOV_FR32 PSEUDO!", -                    [(set FR32:$dst, (X86cmov FR32:$t, FR32:$f, imm:$cond, -                                                  EFLAGS))]>; -  def CMOV_FR64 : I<0, Pseudo, -                    (outs FR64:$dst), (ins FR64:$t, FR64:$f, i8imm:$cond), -                    "#CMOV_FR64 PSEUDO!", -                    [(set FR64:$dst, (X86cmov FR64:$t, FR64:$f, imm:$cond, -                                                  EFLAGS))]>; -  def CMOV_V4F32 : I<0, Pseudo, -                    (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond), -                    "#CMOV_V4F32 PSEUDO!", -                    [(set VR128:$dst, -                      (v4f32 (X86cmov VR128:$t, VR128:$f, imm:$cond, -                                          EFLAGS)))]>; -  def CMOV_V2F64 : I<0, Pseudo, -                    (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond), -                    "#CMOV_V2F64 PSEUDO!", -                    [(set VR128:$dst, -                      (v2f64 (X86cmov VR128:$t, VR128:$f, imm:$cond, -                                          EFLAGS)))]>; -  def CMOV_V2I64 : I<0, Pseudo, -                    (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond), -                    "#CMOV_V2I64 PSEUDO!", -                    [(set VR128:$dst, -                      (v2i64 (X86cmov VR128:$t, VR128:$f, imm:$cond, -                                          EFLAGS)))]>; -  def CMOV_V8F32 : I<0, Pseudo, -                    (outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond), -                    "#CMOV_V8F32 PSEUDO!", -                    [(set VR256:$dst, -                      (v8f32 (X86cmov VR256:$t, VR256:$f, imm:$cond, -                                          EFLAGS)))]>; -  def CMOV_V4F64 : I<0, Pseudo, -                    (outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond), -                    "#CMOV_V4F64 PSEUDO!", -                    [(set VR256:$dst, -                      (v4f64 (X86cmov VR256:$t, VR256:$f, imm:$cond, -                                          EFLAGS)))]>; -  def CMOV_V4I64 : I<0, Pseudo, -                    (outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond), -                    "#CMOV_V4I64 PSEUDO!", -                    [(set VR256:$dst, -                      (v4i64 (X86cmov VR256:$t, VR256:$f, imm:$cond, -                                          EFLAGS)))]>; -  def CMOV_V8I64 : I<0, Pseudo, -                    (outs VR512:$dst), (ins VR512:$t, VR512:$f, i8imm:$cond), -                    "#CMOV_V8I64 PSEUDO!", -                    [(set VR512:$dst, -                      (v8i64 (X86cmov VR512:$t, VR512:$f, imm:$cond, -                                          EFLAGS)))]>; -  def CMOV_V8F64 : I<0, Pseudo, -                    (outs VR512:$dst), (ins VR512:$t, VR512:$f, i8imm:$cond), -                    "#CMOV_V8F64 PSEUDO!", -                    [(set VR512:$dst, -                      (v8f64 (X86cmov VR512:$t, VR512:$f, imm:$cond, -                                          EFLAGS)))]>; -  def CMOV_V16F32 : I<0, Pseudo, -                    (outs VR512:$dst), (ins VR512:$t, VR512:$f, i8imm:$cond), -                    "#CMOV_V16F32 PSEUDO!", -                    [(set VR512:$dst, -                      (v16f32 (X86cmov VR512:$t, VR512:$f, imm:$cond, -                                          EFLAGS)))]>; -} - - 
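Hedged sketch of the case these vector CMOV pseudos cover (names are mine): both arms of the select are whole vectors but the condition is a single scalar flag, so there is no one instruction for it and the pseudo is later expanded into a branch around a register copy.

  #include <xmmintrin.h>

  __m128 select_vec(bool cond, __m128 a, __m128 b) {
    return cond ? a : b;   // whole-vector select on a scalar condition
  }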
-//===----------------------------------------------------------------------===// -// DAG Pattern Matching Rules -//===----------------------------------------------------------------------===// - -// ConstantPool GlobalAddress, ExternalSymbol, and JumpTable -def : Pat<(i32 (X86Wrapper tconstpool  :$dst)), (MOV32ri tconstpool  :$dst)>; -def : Pat<(i32 (X86Wrapper tjumptable  :$dst)), (MOV32ri tjumptable  :$dst)>; -def : Pat<(i32 (X86Wrapper tglobaltlsaddr:$dst)),(MOV32ri tglobaltlsaddr:$dst)>; -def : Pat<(i32 (X86Wrapper tglobaladdr :$dst)), (MOV32ri tglobaladdr :$dst)>; -def : Pat<(i32 (X86Wrapper texternalsym:$dst)), (MOV32ri texternalsym:$dst)>; -def : Pat<(i32 (X86Wrapper tblockaddress:$dst)), (MOV32ri tblockaddress:$dst)>; - -def : Pat<(add GR32:$src1, (X86Wrapper tconstpool:$src2)), -          (ADD32ri GR32:$src1, tconstpool:$src2)>; -def : Pat<(add GR32:$src1, (X86Wrapper tjumptable:$src2)), -          (ADD32ri GR32:$src1, tjumptable:$src2)>; -def : Pat<(add GR32:$src1, (X86Wrapper tglobaladdr :$src2)), -          (ADD32ri GR32:$src1, tglobaladdr:$src2)>; -def : Pat<(add GR32:$src1, (X86Wrapper texternalsym:$src2)), -          (ADD32ri GR32:$src1, texternalsym:$src2)>; -def : Pat<(add GR32:$src1, (X86Wrapper tblockaddress:$src2)), -          (ADD32ri GR32:$src1, tblockaddress:$src2)>; - -def : Pat<(store (i32 (X86Wrapper tglobaladdr:$src)), addr:$dst), -          (MOV32mi addr:$dst, tglobaladdr:$src)>; -def : Pat<(store (i32 (X86Wrapper texternalsym:$src)), addr:$dst), -          (MOV32mi addr:$dst, texternalsym:$src)>; -def : Pat<(store (i32 (X86Wrapper tblockaddress:$src)), addr:$dst), -          (MOV32mi addr:$dst, tblockaddress:$src)>; - -// ConstantPool GlobalAddress, ExternalSymbol, and JumpTable when not in small -// code model mode, should use 'movabs'.  FIXME: This is really a hack, the -//  'movabs' predicate should handle this sort of thing. -def : Pat<(i64 (X86Wrapper tconstpool  :$dst)), -          (MOV64ri tconstpool  :$dst)>, Requires<[FarData]>; -def : Pat<(i64 (X86Wrapper tjumptable  :$dst)), -          (MOV64ri tjumptable  :$dst)>, Requires<[FarData]>; -def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)), -          (MOV64ri tglobaladdr :$dst)>, Requires<[FarData]>; -def : Pat<(i64 (X86Wrapper texternalsym:$dst)), -          (MOV64ri texternalsym:$dst)>, Requires<[FarData]>; -def : Pat<(i64 (X86Wrapper tblockaddress:$dst)), -          (MOV64ri tblockaddress:$dst)>, Requires<[FarData]>; - -// In kernel code model, we can get the address of a label -// into a register with 'movq'.  FIXME: This is a hack, the 'imm' predicate of -// the MOV64ri32 should accept these. -def : Pat<(i64 (X86Wrapper tconstpool  :$dst)), -          (MOV64ri32 tconstpool  :$dst)>, Requires<[KernelCode]>; -def : Pat<(i64 (X86Wrapper tjumptable  :$dst)), -          (MOV64ri32 tjumptable  :$dst)>, Requires<[KernelCode]>; -def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)), -          (MOV64ri32 tglobaladdr :$dst)>, Requires<[KernelCode]>; -def : Pat<(i64 (X86Wrapper texternalsym:$dst)), -          (MOV64ri32 texternalsym:$dst)>, Requires<[KernelCode]>; -def : Pat<(i64 (X86Wrapper tblockaddress:$dst)), -          (MOV64ri32 tblockaddress:$dst)>, Requires<[KernelCode]>; - -// If we have small model and -static mode, it is safe to store global addresses -// directly as immediates.  FIXME: This is really a hack, the 'imm' predicate -// for MOV64mi32 should handle this sort of thing. 
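What that case looks like in source form (symbol names are mine): with the small code model and static relocation, the address of a global fits in a sign-extended 32-bit immediate, so a store like the one below can be matched by the MOV64mi32 patterns that follow instead of first materializing the address in a register.

  int global_table[16];
  int *global_ptr;

  void remember_table() { global_ptr = global_table; }   // store of &global_table as an immediate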
-def : Pat<(store (i64 (X86Wrapper tconstpool:$src)), addr:$dst), -          (MOV64mi32 addr:$dst, tconstpool:$src)>, -          Requires<[NearData, IsStatic]>; -def : Pat<(store (i64 (X86Wrapper tjumptable:$src)), addr:$dst), -          (MOV64mi32 addr:$dst, tjumptable:$src)>, -          Requires<[NearData, IsStatic]>; -def : Pat<(store (i64 (X86Wrapper tglobaladdr:$src)), addr:$dst), -          (MOV64mi32 addr:$dst, tglobaladdr:$src)>, -          Requires<[NearData, IsStatic]>; -def : Pat<(store (i64 (X86Wrapper texternalsym:$src)), addr:$dst), -          (MOV64mi32 addr:$dst, texternalsym:$src)>, -          Requires<[NearData, IsStatic]>; -def : Pat<(store (i64 (X86Wrapper tblockaddress:$src)), addr:$dst), -          (MOV64mi32 addr:$dst, tblockaddress:$src)>, -          Requires<[NearData, IsStatic]>; - -def : Pat<(i32 (X86RecoverFrameAlloc texternalsym:$dst)), (MOV32ri texternalsym:$dst)>; -def : Pat<(i64 (X86RecoverFrameAlloc texternalsym:$dst)), (MOV64ri texternalsym:$dst)>; - -// Calls - -// tls has some funny stuff here... -// This corresponds to movabs $foo@tpoff, %rax -def : Pat<(i64 (X86Wrapper tglobaltlsaddr :$dst)), -          (MOV64ri32 tglobaltlsaddr :$dst)>; -// This corresponds to add $foo@tpoff, %rax -def : Pat<(add GR64:$src1, (X86Wrapper tglobaltlsaddr :$dst)), -          (ADD64ri32 GR64:$src1, tglobaltlsaddr :$dst)>; - - -// Direct PC relative function call for small code model. 32-bit displacement -// sign extended to 64-bit. -def : Pat<(X86call (i64 tglobaladdr:$dst)), -          (CALL64pcrel32 tglobaladdr:$dst)>; -def : Pat<(X86call (i64 texternalsym:$dst)), -          (CALL64pcrel32 texternalsym:$dst)>; - -// Tailcall stuff. The TCRETURN instructions execute after the epilog, so they -// can never use callee-saved registers. That is the purpose of the GR64_TC -// register classes. -// -// The only volatile register that is never used by the calling convention is -// %r11. This happens when calling a vararg function with 6 arguments. -// -// Match an X86tcret that uses less than 7 volatile registers. -def X86tcret_6regs : PatFrag<(ops node:$ptr, node:$off), -                             (X86tcret node:$ptr, node:$off), [{ -  // X86tcret args: (*chain, ptr, imm, regs..., glue) -  unsigned NumRegs = 0; -  for (unsigned i = 3, e = N->getNumOperands(); i != e; ++i) -    if (isa<RegisterSDNode>(N->getOperand(i)) && ++NumRegs > 6) -      return false; -  return true; -}]>; - -def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off), -          (TCRETURNri ptr_rc_tailcall:$dst, imm:$off)>, -          Requires<[Not64BitMode]>; - -// FIXME: This is disabled for 32-bit PIC mode because the global base -// register which is part of the address mode may be assigned a -// callee-saved register. -def : Pat<(X86tcret (load addr:$dst), imm:$off), -          (TCRETURNmi addr:$dst, imm:$off)>, -          Requires<[Not64BitMode, IsNotPIC]>; - -def : Pat<(X86tcret (i32 tglobaladdr:$dst), imm:$off), -          (TCRETURNdi tglobaladdr:$dst, imm:$off)>, -          Requires<[NotLP64]>; - -def : Pat<(X86tcret (i32 texternalsym:$dst), imm:$off), -          (TCRETURNdi texternalsym:$dst, imm:$off)>, -          Requires<[NotLP64]>; - -def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off), -          (TCRETURNri64 ptr_rc_tailcall:$dst, imm:$off)>, -          Requires<[In64BitMode]>; - -// Don't fold loads into X86tcret requiring more than 6 regs. -// There wouldn't be enough scratch registers for base+index. 
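The shape of code that exercises these tail-call patterns (types and names are mine): a call in tail position through a function pointer loaded from memory. When it is selected as a tail call the TCRETURN* forms fire, and folding the load into the jump (the pattern right below) is only done while enough volatile registers are left over for the base+index address, which is what X86tcret_6regs checks.

  struct Ops { long (*combine)(long, long); };

  long dispatch(const Ops *ops, long a, long b) {
    return ops->combine(a, b);   // tail call through a loaded pointer
  }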
-def : Pat<(X86tcret_6regs (load addr:$dst), imm:$off), -          (TCRETURNmi64 addr:$dst, imm:$off)>, -          Requires<[In64BitMode]>; - -def : Pat<(X86tcret (i64 tglobaladdr:$dst), imm:$off), -          (TCRETURNdi64 tglobaladdr:$dst, imm:$off)>, -          Requires<[IsLP64]>; - -def : Pat<(X86tcret (i64 texternalsym:$dst), imm:$off), -          (TCRETURNdi64 texternalsym:$dst, imm:$off)>, -          Requires<[IsLP64]>; - -// Normal calls, with various flavors of addresses. -def : Pat<(X86call (i32 tglobaladdr:$dst)), -          (CALLpcrel32 tglobaladdr:$dst)>; -def : Pat<(X86call (i32 texternalsym:$dst)), -          (CALLpcrel32 texternalsym:$dst)>; -def : Pat<(X86call (i32 imm:$dst)), -          (CALLpcrel32 imm:$dst)>, Requires<[CallImmAddr]>; - -// Comparisons. - -// TEST R,R is smaller than CMP R,0 -def : Pat<(X86cmp GR8:$src1, 0), -          (TEST8rr GR8:$src1, GR8:$src1)>; -def : Pat<(X86cmp GR16:$src1, 0), -          (TEST16rr GR16:$src1, GR16:$src1)>; -def : Pat<(X86cmp GR32:$src1, 0), -          (TEST32rr GR32:$src1, GR32:$src1)>; -def : Pat<(X86cmp GR64:$src1, 0), -          (TEST64rr GR64:$src1, GR64:$src1)>; - -// Conditional moves with folded loads with operands swapped and conditions -// inverted. -multiclass CMOVmr<PatLeaf InvertedCond, Instruction Inst16, Instruction Inst32, -                  Instruction Inst64> { -  let Predicates = [HasCMov] in { -    def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, InvertedCond, EFLAGS), -              (Inst16 GR16:$src2, addr:$src1)>; -    def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, InvertedCond, EFLAGS), -              (Inst32 GR32:$src2, addr:$src1)>; -    def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, InvertedCond, EFLAGS), -              (Inst64 GR64:$src2, addr:$src1)>; -  } -} - -defm : CMOVmr<X86_COND_B , CMOVAE16rm, CMOVAE32rm, CMOVAE64rm>; -defm : CMOVmr<X86_COND_AE, CMOVB16rm , CMOVB32rm , CMOVB64rm>; -defm : CMOVmr<X86_COND_E , CMOVNE16rm, CMOVNE32rm, CMOVNE64rm>; -defm : CMOVmr<X86_COND_NE, CMOVE16rm , CMOVE32rm , CMOVE64rm>; -defm : CMOVmr<X86_COND_BE, CMOVA16rm , CMOVA32rm , CMOVA64rm>; -defm : CMOVmr<X86_COND_A , CMOVBE16rm, CMOVBE32rm, CMOVBE64rm>; -defm : CMOVmr<X86_COND_L , CMOVGE16rm, CMOVGE32rm, CMOVGE64rm>; -defm : CMOVmr<X86_COND_GE, CMOVL16rm , CMOVL32rm , CMOVL64rm>; -defm : CMOVmr<X86_COND_LE, CMOVG16rm , CMOVG32rm , CMOVG64rm>; -defm : CMOVmr<X86_COND_G , CMOVLE16rm, CMOVLE32rm, CMOVLE64rm>; -defm : CMOVmr<X86_COND_P , CMOVNP16rm, CMOVNP32rm, CMOVNP64rm>; -defm : CMOVmr<X86_COND_NP, CMOVP16rm , CMOVP32rm , CMOVP64rm>; -defm : CMOVmr<X86_COND_S , CMOVNS16rm, CMOVNS32rm, CMOVNS64rm>; -defm : CMOVmr<X86_COND_NS, CMOVS16rm , CMOVS32rm , CMOVS64rm>; -defm : CMOVmr<X86_COND_O , CMOVNO16rm, CMOVNO32rm, CMOVNO64rm>; -defm : CMOVmr<X86_COND_NO, CMOVO16rm , CMOVO32rm , CMOVO64rm>; - -// zextload bool -> zextload byte -def : Pat<(zextloadi8i1  addr:$src), (MOV8rm     addr:$src)>; -def : Pat<(zextloadi16i1 addr:$src), (MOVZX16rm8 addr:$src)>; -def : Pat<(zextloadi32i1 addr:$src), (MOVZX32rm8 addr:$src)>; -def : Pat<(zextloadi64i1 addr:$src), -          (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>; - -// extload bool -> extload byte -// When extloading from 16-bit and smaller memory locations into 64-bit -// registers, use zero-extending loads so that the entire 64-bit register is -// defined, avoiding partial-register updates. 
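A small example of the widening loads these patterns describe (function name is mine): loading a byte into a 64-bit value can use a 32-bit movzbl, and writing the 32-bit register already zeroes bits 63:32, which is what the SUBREG_TO_REG wrapping below encodes.

  #include <cstdint>

  uint64_t widen_byte(const uint8_t *p) {
    return *p;   // typically movzbl (%rdi), %eax on x86-64
  }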
- -def : Pat<(extloadi8i1 addr:$src),   (MOV8rm      addr:$src)>; -def : Pat<(extloadi16i1 addr:$src),  (MOVZX16rm8  addr:$src)>; -def : Pat<(extloadi32i1 addr:$src),  (MOVZX32rm8  addr:$src)>; -def : Pat<(extloadi16i8 addr:$src),  (MOVZX16rm8  addr:$src)>; -def : Pat<(extloadi32i8 addr:$src),  (MOVZX32rm8  addr:$src)>; -def : Pat<(extloadi32i16 addr:$src), (MOVZX32rm16 addr:$src)>; - -// For other extloads, use subregs, since the high contents of the register are -// defined after an extload. -def : Pat<(extloadi64i1 addr:$src), -          (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>; -def : Pat<(extloadi64i8 addr:$src), -          (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>; -def : Pat<(extloadi64i16 addr:$src), -          (SUBREG_TO_REG (i64 0), (MOVZX32rm16 addr:$src), sub_32bit)>; -def : Pat<(extloadi64i32 addr:$src), -          (SUBREG_TO_REG (i64 0), (MOV32rm addr:$src), sub_32bit)>; - -// anyext. Define these to do an explicit zero-extend to -// avoid partial-register updates. -def : Pat<(i16 (anyext GR8 :$src)), (EXTRACT_SUBREG -                                     (MOVZX32rr8 GR8 :$src), sub_16bit)>; -def : Pat<(i32 (anyext GR8 :$src)), (MOVZX32rr8  GR8 :$src)>; - -// Except for i16 -> i32 since isel expect i16 ops to be promoted to i32. -def : Pat<(i32 (anyext GR16:$src)), -          (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR16:$src, sub_16bit)>; - -def : Pat<(i64 (anyext GR8 :$src)), -          (SUBREG_TO_REG (i64 0), (MOVZX32rr8  GR8  :$src), sub_32bit)>; -def : Pat<(i64 (anyext GR16:$src)), -          (SUBREG_TO_REG (i64 0), (MOVZX32rr16 GR16 :$src), sub_32bit)>; -def : Pat<(i64 (anyext GR32:$src)), -          (SUBREG_TO_REG (i64 0), GR32:$src, sub_32bit)>; - - -// Any instruction that defines a 32-bit result leaves the high half of the -// register. Truncate can be lowered to EXTRACT_SUBREG. CopyFromReg may -// be copying from a truncate. And x86's cmov doesn't do anything if the -// condition is false. But any other 32-bit operation will zero-extend -// up to 64 bits. -def def32 : PatLeaf<(i32 GR32:$src), [{ -  return N->getOpcode() != ISD::TRUNCATE && -         N->getOpcode() != TargetOpcode::EXTRACT_SUBREG && -         N->getOpcode() != ISD::CopyFromReg && -         N->getOpcode() != ISD::AssertSext && -         N->getOpcode() != X86ISD::CMOV; -}]>; - -// In the case of a 32-bit def that is known to implicitly zero-extend, -// we can use a SUBREG_TO_REG. -def : Pat<(i64 (zext def32:$src)), -          (SUBREG_TO_REG (i64 0), GR32:$src, sub_32bit)>; - -//===----------------------------------------------------------------------===// -// Pattern match OR as ADD -//===----------------------------------------------------------------------===// - -// If safe, we prefer to pattern match OR as ADD at isel time. ADD can be -// 3-addressified into an LEA instruction to avoid copies.  However, we also -// want to finally emit these instructions as an or at the end of the code -// generator to make the generated code easier to read.  To do this, we select -// into "disjoint bits" pseudo ops. - -// Treat an 'or' node is as an 'add' if the or'ed bits are known to be zero. 
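A typical source pattern the or_is_add predicate below recognizes (names are mine): the two operands share no set bits, so OR and ADD produce the same value and the backend is free to use the ADD/LEA form.

  #include <cstdint>

  uint32_t pack16x2(uint32_t hi, uint32_t lo) {
    return (hi << 16) | (lo & 0xffffu);   // disjoint bits: or == add
  }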
-def or_is_add : PatFrag<(ops node:$lhs, node:$rhs), (or node:$lhs, node:$rhs),[{ -  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(1))) -    return CurDAG->MaskedValueIsZero(N->getOperand(0), CN->getAPIntValue()); - -  APInt KnownZero0, KnownOne0; -  CurDAG->computeKnownBits(N->getOperand(0), KnownZero0, KnownOne0, 0); -  APInt KnownZero1, KnownOne1; -  CurDAG->computeKnownBits(N->getOperand(1), KnownZero1, KnownOne1, 0); -  return (~KnownZero0 & ~KnownZero1) == 0; -}]>; - - -// (or x1, x2) -> (add x1, x2) if two operands are known not to share bits. -// Try this before the selecting to OR. -let AddedComplexity = 5, SchedRW = [WriteALU] in { - -let isConvertibleToThreeAddress = 1, -    Constraints = "$src1 = $dst", Defs = [EFLAGS] in { -let isCommutable = 1 in { -def ADD16rr_DB  : I<0, Pseudo, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), -                    "", // orw/addw REG, REG -                    [(set GR16:$dst, (or_is_add GR16:$src1, GR16:$src2))]>; -def ADD32rr_DB  : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), -                    "", // orl/addl REG, REG -                    [(set GR32:$dst, (or_is_add GR32:$src1, GR32:$src2))]>; -def ADD64rr_DB  : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), -                    "", // orq/addq REG, REG -                    [(set GR64:$dst, (or_is_add GR64:$src1, GR64:$src2))]>; -} // isCommutable - -// NOTE: These are order specific, we want the ri8 forms to be listed -// first so that they are slightly preferred to the ri forms. - -def ADD16ri8_DB : I<0, Pseudo, -                    (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2), -                    "", // orw/addw REG, imm8 -                    [(set GR16:$dst,(or_is_add GR16:$src1,i16immSExt8:$src2))]>; -def ADD16ri_DB  : I<0, Pseudo, (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2), -                    "", // orw/addw REG, imm -                    [(set GR16:$dst, (or_is_add GR16:$src1, imm:$src2))]>; - -def ADD32ri8_DB : I<0, Pseudo, -                    (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2), -                    "", // orl/addl REG, imm8 -                    [(set GR32:$dst,(or_is_add GR32:$src1,i32immSExt8:$src2))]>; -def ADD32ri_DB  : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2), -                    "", // orl/addl REG, imm -                    [(set GR32:$dst, (or_is_add GR32:$src1, imm:$src2))]>; - - -def ADD64ri8_DB : I<0, Pseudo, -                    (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2), -                    "", // orq/addq REG, imm8 -                    [(set GR64:$dst, (or_is_add GR64:$src1, -                                                i64immSExt8:$src2))]>; -def ADD64ri32_DB : I<0, Pseudo, -                     (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2), -                      "", // orq/addq REG, imm -                      [(set GR64:$dst, (or_is_add GR64:$src1, -                                                  i64immSExt32:$src2))]>; -} -} // AddedComplexity, SchedRW - - -//===----------------------------------------------------------------------===// -// Some peepholes -//===----------------------------------------------------------------------===// - -// Odd encoding trick: -128 fits into an 8-bit immediate field while -// +128 doesn't, so in this special case use a sub instead of an add. 
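A worked instance of the trick just described (function name is mine): adding 128 would need the 32-bit-immediate add form, while subtracting -128 fits the sign-extended 8-bit immediate and saves three bytes of encoding.

  #include <cstdint>

  int64_t add_128(int64_t x) {
    return x + 128;   // expected to be selected as something like subq $-128, %rax
  }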
-def : Pat<(add GR16:$src1, 128), -          (SUB16ri8 GR16:$src1, -128)>; -def : Pat<(store (add (loadi16 addr:$dst), 128), addr:$dst), -          (SUB16mi8 addr:$dst, -128)>; - -def : Pat<(add GR32:$src1, 128), -          (SUB32ri8 GR32:$src1, -128)>; -def : Pat<(store (add (loadi32 addr:$dst), 128), addr:$dst), -          (SUB32mi8 addr:$dst, -128)>; - -def : Pat<(add GR64:$src1, 128), -          (SUB64ri8 GR64:$src1, -128)>; -def : Pat<(store (add (loadi64 addr:$dst), 128), addr:$dst), -          (SUB64mi8 addr:$dst, -128)>; - -// The same trick applies for 32-bit immediate fields in 64-bit -// instructions. -def : Pat<(add GR64:$src1, 0x0000000080000000), -          (SUB64ri32 GR64:$src1, 0xffffffff80000000)>; -def : Pat<(store (add (loadi64 addr:$dst), 0x00000000800000000), addr:$dst), -          (SUB64mi32 addr:$dst, 0xffffffff80000000)>; - -// To avoid needing to materialize an immediate in a register, use a 32-bit and -// with implicit zero-extension instead of a 64-bit and if the immediate has at -// least 32 bits of leading zeros. If in addition the last 32 bits can be -// represented with a sign extension of a 8 bit constant, use that. - -def : Pat<(and GR64:$src, i64immZExt32SExt8:$imm), -          (SUBREG_TO_REG -            (i64 0), -            (AND32ri8 -              (EXTRACT_SUBREG GR64:$src, sub_32bit), -              (i32 (GetLo8XForm imm:$imm))), -            sub_32bit)>; - -def : Pat<(and GR64:$src, i64immZExt32:$imm), -          (SUBREG_TO_REG -            (i64 0), -            (AND32ri -              (EXTRACT_SUBREG GR64:$src, sub_32bit), -              (i32 (GetLo32XForm imm:$imm))), -            sub_32bit)>; - - -// r & (2^16-1) ==> movz -def : Pat<(and GR32:$src1, 0xffff), -          (MOVZX32rr16 (EXTRACT_SUBREG GR32:$src1, sub_16bit))>; -// r & (2^8-1) ==> movz -def : Pat<(and GR32:$src1, 0xff), -          (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src1, -                                                             GR32_ABCD)), -                                      sub_8bit))>, -      Requires<[Not64BitMode]>; -// r & (2^8-1) ==> movz -def : Pat<(and GR16:$src1, 0xff), -           (EXTRACT_SUBREG (MOVZX32rr8 (EXTRACT_SUBREG -            (i16 (COPY_TO_REGCLASS GR16:$src1, GR16_ABCD)), sub_8bit)), -             sub_16bit)>, -      Requires<[Not64BitMode]>; - -// r & (2^32-1) ==> movz -def : Pat<(and GR64:$src, 0x00000000FFFFFFFF), -          (SUBREG_TO_REG (i64 0), -                         (MOV32rr (EXTRACT_SUBREG GR64:$src, sub_32bit)), -                         sub_32bit)>; -// r & (2^16-1) ==> movz -def : Pat<(and GR64:$src, 0xffff), -          (SUBREG_TO_REG (i64 0), -                      (MOVZX32rr16 (i16 (EXTRACT_SUBREG GR64:$src, sub_16bit))), -                      sub_32bit)>; -// r & (2^8-1) ==> movz -def : Pat<(and GR64:$src, 0xff), -          (SUBREG_TO_REG (i64 0), -                         (MOVZX32rr8 (i8 (EXTRACT_SUBREG GR64:$src, sub_8bit))), -                         sub_32bit)>; -// r & (2^8-1) ==> movz -def : Pat<(and GR32:$src1, 0xff), -           (MOVZX32rr8 (EXTRACT_SUBREG GR32:$src1, sub_8bit))>, -      Requires<[In64BitMode]>; -// r & (2^8-1) ==> movz -def : Pat<(and GR16:$src1, 0xff), -           (EXTRACT_SUBREG (MOVZX32rr8 (i8 -            (EXTRACT_SUBREG GR16:$src1, sub_8bit))), sub_16bit)>, -      Requires<[In64BitMode]>; - - -// sext_inreg patterns -def : Pat<(sext_inreg GR32:$src, i16), -          (MOVSX32rr16 (EXTRACT_SUBREG GR32:$src, sub_16bit))>; -def : Pat<(sext_inreg GR32:$src, i8), -          (MOVSX32rr8 
(EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, -                                                             GR32_ABCD)), -                                      sub_8bit))>, -      Requires<[Not64BitMode]>; - -def : Pat<(sext_inreg GR16:$src, i8), -           (EXTRACT_SUBREG (i32 (MOVSX32rr8 (EXTRACT_SUBREG -            (i32 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), sub_8bit))), -             sub_16bit)>, -      Requires<[Not64BitMode]>; - -def : Pat<(sext_inreg GR64:$src, i32), -          (MOVSX64rr32 (EXTRACT_SUBREG GR64:$src, sub_32bit))>; -def : Pat<(sext_inreg GR64:$src, i16), -          (MOVSX64rr16 (EXTRACT_SUBREG GR64:$src, sub_16bit))>; -def : Pat<(sext_inreg GR64:$src, i8), -          (MOVSX64rr8 (EXTRACT_SUBREG GR64:$src, sub_8bit))>; -def : Pat<(sext_inreg GR32:$src, i8), -          (MOVSX32rr8 (EXTRACT_SUBREG GR32:$src, sub_8bit))>, -      Requires<[In64BitMode]>; -def : Pat<(sext_inreg GR16:$src, i8), -           (EXTRACT_SUBREG (MOVSX32rr8 -            (EXTRACT_SUBREG GR16:$src, sub_8bit)), sub_16bit)>, -      Requires<[In64BitMode]>; - -// sext, sext_load, zext, zext_load -def: Pat<(i16 (sext GR8:$src)), -          (EXTRACT_SUBREG (MOVSX32rr8 GR8:$src), sub_16bit)>; -def: Pat<(sextloadi16i8 addr:$src), -          (EXTRACT_SUBREG (MOVSX32rm8 addr:$src), sub_16bit)>; -def: Pat<(i16 (zext GR8:$src)), -          (EXTRACT_SUBREG (MOVZX32rr8 GR8:$src), sub_16bit)>; -def: Pat<(zextloadi16i8 addr:$src), -          (EXTRACT_SUBREG (MOVZX32rm8 addr:$src), sub_16bit)>; - -// trunc patterns -def : Pat<(i16 (trunc GR32:$src)), -          (EXTRACT_SUBREG GR32:$src, sub_16bit)>; -def : Pat<(i8 (trunc GR32:$src)), -          (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)), -                          sub_8bit)>, -      Requires<[Not64BitMode]>; -def : Pat<(i8 (trunc GR16:$src)), -          (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), -                          sub_8bit)>, -      Requires<[Not64BitMode]>; -def : Pat<(i32 (trunc GR64:$src)), -          (EXTRACT_SUBREG GR64:$src, sub_32bit)>; -def : Pat<(i16 (trunc GR64:$src)), -          (EXTRACT_SUBREG GR64:$src, sub_16bit)>; -def : Pat<(i8 (trunc GR64:$src)), -          (EXTRACT_SUBREG GR64:$src, sub_8bit)>; -def : Pat<(i8 (trunc GR32:$src)), -          (EXTRACT_SUBREG GR32:$src, sub_8bit)>, -      Requires<[In64BitMode]>; -def : Pat<(i8 (trunc GR16:$src)), -          (EXTRACT_SUBREG GR16:$src, sub_8bit)>, -      Requires<[In64BitMode]>; - -// h-register tricks -def : Pat<(i8 (trunc (srl_su GR16:$src, (i8 8)))), -          (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), -                          sub_8bit_hi)>, -      Requires<[Not64BitMode]>; -def : Pat<(i8 (trunc (srl_su GR32:$src, (i8 8)))), -          (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)), -                          sub_8bit_hi)>, -      Requires<[Not64BitMode]>; -def : Pat<(srl GR16:$src, (i8 8)), -          (EXTRACT_SUBREG -            (MOVZX32rr8 -              (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), -                              sub_8bit_hi)), -            sub_16bit)>, -      Requires<[Not64BitMode]>; -def : Pat<(i32 (zext (srl_su GR16:$src, (i8 8)))), -          (MOVZX32rr8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, -                                                             GR16_ABCD)), -                                      sub_8bit_hi))>, -      Requires<[Not64BitMode]>; -def : Pat<(i32 (anyext (srl_su GR16:$src, (i8 8)))), -          (MOVZX32rr8 (EXTRACT_SUBREG (i16 
(COPY_TO_REGCLASS GR16:$src, -                                                             GR16_ABCD)), -                                      sub_8bit_hi))>, -      Requires<[Not64BitMode]>; -def : Pat<(and (srl_su GR32:$src, (i8 8)), (i32 255)), -          (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, -                                                             GR32_ABCD)), -                                      sub_8bit_hi))>, -      Requires<[Not64BitMode]>; -def : Pat<(srl (and_su GR32:$src, 0xff00), (i8 8)), -          (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, -                                                             GR32_ABCD)), -                                      sub_8bit_hi))>, -      Requires<[Not64BitMode]>; - -// h-register tricks. -// For now, be conservative on x86-64 and use an h-register extract only if the -// value is immediately zero-extended or stored, which are somewhat common -// cases. This uses a bunch of code to prevent a register requiring a REX prefix -// from being allocated in the same instruction as the h register, as there's -// currently no way to describe this requirement to the register allocator. - -// h-register extract and zero-extend. -def : Pat<(and (srl_su GR64:$src, (i8 8)), (i64 255)), -          (SUBREG_TO_REG -            (i64 0), -            (MOVZX32_NOREXrr8 -              (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS GR64:$src, GR64_ABCD)), -                              sub_8bit_hi)), -            sub_32bit)>; -def : Pat<(and (srl_su GR32:$src, (i8 8)), (i32 255)), -          (MOVZX32_NOREXrr8 -            (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)), -                            sub_8bit_hi))>, -      Requires<[In64BitMode]>; -def : Pat<(srl (and_su GR32:$src, 0xff00), (i8 8)), -          (MOVZX32_NOREXrr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, -                                                                   GR32_ABCD)), -                                             sub_8bit_hi))>, -      Requires<[In64BitMode]>; -def : Pat<(srl GR16:$src, (i8 8)), -          (EXTRACT_SUBREG -            (MOVZX32_NOREXrr8 -              (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), -                              sub_8bit_hi)), -            sub_16bit)>, -      Requires<[In64BitMode]>; -def : Pat<(i32 (zext (srl_su GR16:$src, (i8 8)))), -          (MOVZX32_NOREXrr8 -            (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), -                            sub_8bit_hi))>, -      Requires<[In64BitMode]>; -def : Pat<(i32 (anyext (srl_su GR16:$src, (i8 8)))), -          (MOVZX32_NOREXrr8 -            (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), -                            sub_8bit_hi))>, -      Requires<[In64BitMode]>; -def : Pat<(i64 (zext (srl_su GR16:$src, (i8 8)))), -          (SUBREG_TO_REG -            (i64 0), -            (MOVZX32_NOREXrr8 -              (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), -                              sub_8bit_hi)), -            sub_32bit)>; -def : Pat<(i64 (anyext (srl_su GR16:$src, (i8 8)))), -          (SUBREG_TO_REG -            (i64 0), -            (MOVZX32_NOREXrr8 -              (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), -                              sub_8bit_hi)), -            sub_32bit)>; - -// h-register extract and store. 
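The kind of code the h-register store patterns below target (names are mine): taking bits 15:8 of a value and storing them as a byte, which can come straight out of %ah/%bh/%ch/%dh instead of going through a shift, subject to the NOREX restriction discussed above.

  #include <cstdint>

  void store_second_byte(uint32_t v, uint8_t *out) {
    *out = static_cast<uint8_t>(v >> 8);   // candidate for a store from an h-register
  }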
-def : Pat<(store (i8 (trunc_su (srl_su GR64:$src, (i8 8)))), addr:$dst),
-          (MOV8mr_NOREX
-            addr:$dst,
-            (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS GR64:$src, GR64_ABCD)),
-                            sub_8bit_hi))>;
-def : Pat<(store (i8 (trunc_su (srl_su GR32:$src, (i8 8)))), addr:$dst),
-          (MOV8mr_NOREX
-            addr:$dst,
-            (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)),
-                            sub_8bit_hi))>,
-      Requires<[In64BitMode]>;
-def : Pat<(store (i8 (trunc_su (srl_su GR16:$src, (i8 8)))), addr:$dst),
-          (MOV8mr_NOREX
-            addr:$dst,
-            (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
-                            sub_8bit_hi))>,
-      Requires<[In64BitMode]>;
-
-
-// (shl x, 1) ==> (add x, x)
-// Note that if x is undef (immediate or otherwise), we could theoretically
-// end up with the two uses of x getting different values, producing a result
-// where the least significant bit is not 0. However, the probability of this
-// happening is considered low enough that this is officially not a
-// "real problem".
-def : Pat<(shl GR8 :$src1, (i8 1)), (ADD8rr  GR8 :$src1, GR8 :$src1)>;
-def : Pat<(shl GR16:$src1, (i8 1)), (ADD16rr GR16:$src1, GR16:$src1)>;
-def : Pat<(shl GR32:$src1, (i8 1)), (ADD32rr GR32:$src1, GR32:$src1)>;
-def : Pat<(shl GR64:$src1, (i8 1)), (ADD64rr GR64:$src1, GR64:$src1)>;
-
-// Helper imms that check if a mask doesn't change significant shift bits.
-def immShift32 : ImmLeaf<i8, [{ return CountTrailingOnes_32(Imm) >= 5; }]>;
-def immShift64 : ImmLeaf<i8, [{ return CountTrailingOnes_32(Imm) >= 6; }]>;
-
-// Shift amount is implicitly masked.
-multiclass MaskedShiftAmountPats<SDNode frag, string name> {
-  // (shift x (and y, 31)) ==> (shift x, y)
-  def : Pat<(frag GR8:$src1, (and CL, immShift32)),
-            (!cast<Instruction>(name # "8rCL") GR8:$src1)>;
-  def : Pat<(frag GR16:$src1, (and CL, immShift32)),
-            (!cast<Instruction>(name # "16rCL") GR16:$src1)>;
-  def : Pat<(frag GR32:$src1, (and CL, immShift32)),
-            (!cast<Instruction>(name # "32rCL") GR32:$src1)>;
-  def : Pat<(store (frag (loadi8 addr:$dst), (and CL, immShift32)), addr:$dst),
-            (!cast<Instruction>(name # "8mCL") addr:$dst)>;
-  def : Pat<(store (frag (loadi16 addr:$dst), (and CL, immShift32)), addr:$dst),
-            (!cast<Instruction>(name # "16mCL") addr:$dst)>;
-  def : Pat<(store (frag (loadi32 addr:$dst), (and CL, immShift32)), addr:$dst),
-            (!cast<Instruction>(name # "32mCL") addr:$dst)>;
-
-  // (shift x (and y, 63)) ==> (shift x, y)
-  def : Pat<(frag GR64:$src1, (and CL, immShift64)),
-            (!cast<Instruction>(name # "64rCL") GR64:$src1)>;
-  def : Pat<(store (frag (loadi64 addr:$dst), (and CL, 63)), addr:$dst),
-            (!cast<Instruction>(name # "64mCL") addr:$dst)>;
-}
-
-defm : MaskedShiftAmountPats<shl, "SHL">;
-defm : MaskedShiftAmountPats<srl, "SHR">;
-defm : MaskedShiftAmountPats<sra, "SAR">;
-defm : MaskedShiftAmountPats<rotl, "ROL">;
-defm : MaskedShiftAmountPats<rotr, "ROR">;
-
-// (anyext (setcc_carry)) -> (setcc_carry)
-def : Pat<(i16 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
-          (SETB_C16r)>;
-def : Pat<(i32 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
-          (SETB_C32r)>;
-def : Pat<(i32 (anyext (i16 (X86setcc_c X86_COND_B, EFLAGS)))),
-          (SETB_C32r)>;
-
-
-
-
-//===----------------------------------------------------------------------===//
-// EFLAGS-defining Patterns
-//===----------------------------------------------------------------------===//
-
-// add reg, reg
-def : Pat<(add GR8 :$src1, GR8 :$src2), (ADD8rr  GR8 :$src1, GR8 :$src2)>;
-def : Pat<(add GR16:$src1, GR16:$src2), (ADD16rr GR16:$src1, GR16:$src2)>;
-def : Pat<(add GR32:$src1, GR32:$src2), (ADD32rr GR32:$src1, GR32:$src2)>;
-
-// add reg, mem
-def : Pat<(add GR8:$src1, (loadi8 addr:$src2)),
-          (ADD8rm GR8:$src1, addr:$src2)>;
-def : Pat<(add GR16:$src1, (loadi16 addr:$src2)),
-          (ADD16rm GR16:$src1, addr:$src2)>;
-def : Pat<(add GR32:$src1, (loadi32 addr:$src2)),
-          (ADD32rm GR32:$src1, addr:$src2)>;
-
-// add reg, imm
-def : Pat<(add GR8 :$src1, imm:$src2), (ADD8ri  GR8:$src1 , imm:$src2)>;
-def : Pat<(add GR16:$src1, imm:$src2), (ADD16ri GR16:$src1, imm:$src2)>;
-def : Pat<(add GR32:$src1, imm:$src2), (ADD32ri GR32:$src1, imm:$src2)>;
-def : Pat<(add GR16:$src1, i16immSExt8:$src2),
-          (ADD16ri8 GR16:$src1, i16immSExt8:$src2)>;
-def : Pat<(add GR32:$src1, i32immSExt8:$src2),
-          (ADD32ri8 GR32:$src1, i32immSExt8:$src2)>;
-
-// sub reg, reg
-def : Pat<(sub GR8 :$src1, GR8 :$src2), (SUB8rr  GR8 :$src1, GR8 :$src2)>;
-def : Pat<(sub GR16:$src1, GR16:$src2), (SUB16rr GR16:$src1, GR16:$src2)>;
-def : Pat<(sub GR32:$src1, GR32:$src2), (SUB32rr GR32:$src1, GR32:$src2)>;
-
-// sub reg, mem
-def : Pat<(sub GR8:$src1, (loadi8 addr:$src2)),
-          (SUB8rm GR8:$src1, addr:$src2)>;
-def : Pat<(sub GR16:$src1, (loadi16 addr:$src2)),
-          (SUB16rm GR16:$src1, addr:$src2)>;
-def : Pat<(sub GR32:$src1, (loadi32 addr:$src2)),
-          (SUB32rm GR32:$src1, addr:$src2)>;
-
-// sub reg, imm
-def : Pat<(sub GR8:$src1, imm:$src2),
-          (SUB8ri GR8:$src1, imm:$src2)>;
-def : Pat<(sub GR16:$src1, imm:$src2),
-          (SUB16ri GR16:$src1, imm:$src2)>;
-def : Pat<(sub GR32:$src1, imm:$src2),
-          (SUB32ri GR32:$src1, imm:$src2)>;
-def : Pat<(sub GR16:$src1, i16immSExt8:$src2),
-          (SUB16ri8 GR16:$src1, i16immSExt8:$src2)>;
-def : Pat<(sub GR32:$src1, i32immSExt8:$src2),
-          (SUB32ri8 GR32:$src1, i32immSExt8:$src2)>;
-
-// sub 0, reg
-def : Pat<(X86sub_flag 0, GR8 :$src), (NEG8r  GR8 :$src)>;
-def : Pat<(X86sub_flag 0, GR16:$src), (NEG16r GR16:$src)>;
-def : Pat<(X86sub_flag 0, GR32:$src), (NEG32r GR32:$src)>;
-def : Pat<(X86sub_flag 0, GR64:$src), (NEG64r GR64:$src)>;
-
-// mul reg, reg
-def : Pat<(mul GR16:$src1, GR16:$src2),
-          (IMUL16rr GR16:$src1, GR16:$src2)>;
-def : Pat<(mul GR32:$src1, GR32:$src2),
-          (IMUL32rr GR32:$src1, GR32:$src2)>;
-
-// mul reg, mem
-def : Pat<(mul GR16:$src1, (loadi16 addr:$src2)),
-          (IMUL16rm GR16:$src1, addr:$src2)>;
-def : Pat<(mul GR32:$src1, (loadi32 addr:$src2)),
-          (IMUL32rm GR32:$src1, addr:$src2)>;
-
-// mul reg, imm
-def : Pat<(mul GR16:$src1, imm:$src2),
-          (IMUL16rri GR16:$src1, imm:$src2)>;
-def : Pat<(mul GR32:$src1, imm:$src2),
-          (IMUL32rri GR32:$src1, imm:$src2)>;
-def : Pat<(mul GR16:$src1, i16immSExt8:$src2),
-          (IMUL16rri8 GR16:$src1, i16immSExt8:$src2)>;
-def : Pat<(mul GR32:$src1, i32immSExt8:$src2),
-          (IMUL32rri8 GR32:$src1, i32immSExt8:$src2)>;
-
-// reg = mul mem, imm
-def : Pat<(mul (loadi16 addr:$src1), imm:$src2),
-          (IMUL16rmi addr:$src1, imm:$src2)>;
-def : Pat<(mul (loadi32 addr:$src1), imm:$src2),
-          (IMUL32rmi addr:$src1, imm:$src2)>;
-def : Pat<(mul (loadi16 addr:$src1), i16immSExt8:$src2),
-          (IMUL16rmi8 addr:$src1, i16immSExt8:$src2)>;
-def : Pat<(mul (loadi32 addr:$src1), i32immSExt8:$src2),
-          (IMUL32rmi8 addr:$src1, i32immSExt8:$src2)>;
-
-// Patterns for nodes that do not produce flags, for instructions that do.
-
-// addition
-def : Pat<(add GR64:$src1, GR64:$src2),
-          (ADD64rr GR64:$src1, GR64:$src2)>;
-def : Pat<(add GR64:$src1, i64immSExt8:$src2),
-          (ADD64ri8 GR64:$src1, i64immSExt8:$src2)>;
-def : Pat<(add GR64:$src1, i64immSExt32:$src2),
-          (ADD64ri32 GR64:$src1, i64immSExt32:$src2)>;
-def : Pat<(add GR64:$src1, (loadi64 addr:$src2)),
-          (ADD64rm GR64:$src1, addr:$src2)>;
-
-// subtraction
-def : Pat<(sub GR64:$src1, GR64:$src2),
-          (SUB64rr GR64:$src1, GR64:$src2)>;
-def : Pat<(sub GR64:$src1, (loadi64 addr:$src2)),
-          (SUB64rm GR64:$src1, addr:$src2)>;
-def : Pat<(sub GR64:$src1, i64immSExt8:$src2),
-          (SUB64ri8 GR64:$src1, i64immSExt8:$src2)>;
-def : Pat<(sub GR64:$src1, i64immSExt32:$src2),
-          (SUB64ri32 GR64:$src1, i64immSExt32:$src2)>;
-
-// Multiply
-def : Pat<(mul GR64:$src1, GR64:$src2),
-          (IMUL64rr GR64:$src1, GR64:$src2)>;
-def : Pat<(mul GR64:$src1, (loadi64 addr:$src2)),
-          (IMUL64rm GR64:$src1, addr:$src2)>;
-def : Pat<(mul GR64:$src1, i64immSExt8:$src2),
-          (IMUL64rri8 GR64:$src1, i64immSExt8:$src2)>;
-def : Pat<(mul GR64:$src1, i64immSExt32:$src2),
-          (IMUL64rri32 GR64:$src1, i64immSExt32:$src2)>;
-def : Pat<(mul (loadi64 addr:$src1), i64immSExt8:$src2),
-          (IMUL64rmi8 addr:$src1, i64immSExt8:$src2)>;
-def : Pat<(mul (loadi64 addr:$src1), i64immSExt32:$src2),
-          (IMUL64rmi32 addr:$src1, i64immSExt32:$src2)>;
-
-// Increment/Decrement reg.
-// Do not make INC/DEC if it is slow
-let Predicates = [NotSlowIncDec] in {
-  def : Pat<(add GR8:$src, 1),   (INC8r GR8:$src)>;
-  def : Pat<(add GR16:$src, 1),  (INC16r GR16:$src)>;
-  def : Pat<(add GR32:$src, 1),  (INC32r GR32:$src)>;
-  def : Pat<(add GR64:$src, 1),  (INC64r GR64:$src)>;
-  def : Pat<(add GR8:$src, -1),  (DEC8r GR8:$src)>;
-  def : Pat<(add GR16:$src, -1), (DEC16r GR16:$src)>;
-  def : Pat<(add GR32:$src, -1), (DEC32r GR32:$src)>;
-  def : Pat<(add GR64:$src, -1), (DEC64r GR64:$src)>;
-}
-
-// or reg/reg.
-def : Pat<(or GR8 :$src1, GR8 :$src2), (OR8rr  GR8 :$src1, GR8 :$src2)>;
-def : Pat<(or GR16:$src1, GR16:$src2), (OR16rr GR16:$src1, GR16:$src2)>;
-def : Pat<(or GR32:$src1, GR32:$src2), (OR32rr GR32:$src1, GR32:$src2)>;
-def : Pat<(or GR64:$src1, GR64:$src2), (OR64rr GR64:$src1, GR64:$src2)>;
-
-// or reg/mem
-def : Pat<(or GR8:$src1, (loadi8 addr:$src2)),
-          (OR8rm GR8:$src1, addr:$src2)>;
-def : Pat<(or GR16:$src1, (loadi16 addr:$src2)),
-          (OR16rm GR16:$src1, addr:$src2)>;
-def : Pat<(or GR32:$src1, (loadi32 addr:$src2)),
-          (OR32rm GR32:$src1, addr:$src2)>;
-def : Pat<(or GR64:$src1, (loadi64 addr:$src2)),
-          (OR64rm GR64:$src1, addr:$src2)>;
-
-// or reg/imm
-def : Pat<(or GR8:$src1 , imm:$src2), (OR8ri  GR8 :$src1, imm:$src2)>;
-def : Pat<(or GR16:$src1, imm:$src2), (OR16ri GR16:$src1, imm:$src2)>;
-def : Pat<(or GR32:$src1, imm:$src2), (OR32ri GR32:$src1, imm:$src2)>;
-def : Pat<(or GR16:$src1, i16immSExt8:$src2),
-          (OR16ri8 GR16:$src1, i16immSExt8:$src2)>;
-def : Pat<(or GR32:$src1, i32immSExt8:$src2),
-          (OR32ri8 GR32:$src1, i32immSExt8:$src2)>;
-def : Pat<(or GR64:$src1, i64immSExt8:$src2),
-          (OR64ri8 GR64:$src1, i64immSExt8:$src2)>;
-def : Pat<(or GR64:$src1, i64immSExt32:$src2),
-          (OR64ri32 GR64:$src1, i64immSExt32:$src2)>;
-
-// xor reg/reg
-def : Pat<(xor GR8 :$src1, GR8 :$src2), (XOR8rr  GR8 :$src1, GR8 :$src2)>;
-def : Pat<(xor GR16:$src1, GR16:$src2), (XOR16rr GR16:$src1, GR16:$src2)>;
-def : Pat<(xor GR32:$src1, GR32:$src2), (XOR32rr GR32:$src1, GR32:$src2)>;
-def : Pat<(xor GR64:$src1, GR64:$src2), (XOR64rr GR64:$src1, GR64:$src2)>;
-
-// xor reg/mem
-def : Pat<(xor GR8:$src1, (loadi8 addr:$src2)),
-          (XOR8rm GR8:$src1, addr:$src2)>;
-def : Pat<(xor GR16:$src1, (loadi16 addr:$src2)),
-          (XOR16rm GR16:$src1, addr:$src2)>;
-def : Pat<(xor GR32:$src1, (loadi32 addr:$src2)),
-          (XOR32rm GR32:$src1, addr:$src2)>;
-def : Pat<(xor GR64:$src1, (loadi64 addr:$src2)),
-          (XOR64rm GR64:$src1, addr:$src2)>;
-
-// xor reg/imm
-def : Pat<(xor GR8:$src1, imm:$src2),
-          (XOR8ri GR8:$src1, imm:$src2)>;
-def : Pat<(xor GR16:$src1, imm:$src2),
-          (XOR16ri GR16:$src1, imm:$src2)>;
-def : Pat<(xor GR32:$src1, imm:$src2),
-          (XOR32ri GR32:$src1, imm:$src2)>;
-def : Pat<(xor GR16:$src1, i16immSExt8:$src2),
-          (XOR16ri8 GR16:$src1, i16immSExt8:$src2)>;
-def : Pat<(xor GR32:$src1, i32immSExt8:$src2),
-          (XOR32ri8 GR32:$src1, i32immSExt8:$src2)>;
-def : Pat<(xor GR64:$src1, i64immSExt8:$src2),
-          (XOR64ri8 GR64:$src1, i64immSExt8:$src2)>;
-def : Pat<(xor GR64:$src1, i64immSExt32:$src2),
-          (XOR64ri32 GR64:$src1, i64immSExt32:$src2)>;
-
-// and reg/reg
-def : Pat<(and GR8 :$src1, GR8 :$src2), (AND8rr  GR8 :$src1, GR8 :$src2)>;
-def : Pat<(and GR16:$src1, GR16:$src2), (AND16rr GR16:$src1, GR16:$src2)>;
-def : Pat<(and GR32:$src1, GR32:$src2), (AND32rr GR32:$src1, GR32:$src2)>;
-def : Pat<(and GR64:$src1, GR64:$src2), (AND64rr GR64:$src1, GR64:$src2)>;
-
-// and reg/mem
-def : Pat<(and GR8:$src1, (loadi8 addr:$src2)),
-          (AND8rm GR8:$src1, addr:$src2)>;
-def : Pat<(and GR16:$src1, (loadi16 addr:$src2)),
-          (AND16rm GR16:$src1, addr:$src2)>;
-def : Pat<(and GR32:$src1, (loadi32 addr:$src2)),
-          (AND32rm GR32:$src1, addr:$src2)>;
-def : Pat<(and GR64:$src1, (loadi64 addr:$src2)),
-          (AND64rm GR64:$src1, addr:$src2)>;
-
-// and reg/imm
-def : Pat<(and GR8:$src1, imm:$src2),
-          (AND8ri GR8:$src1, imm:$src2)>;
-def : Pat<(and GR16:$src1, imm:$src2),
-          (AND16ri GR16:$src1, imm:$src2)>;
-def : Pat<(and GR32:$src1, imm:$src2),
-          (AND32ri GR32:$src1, imm:$src2)>;
-def : Pat<(and GR16:$src1, i16immSExt8:$src2),
-          (AND16ri8 GR16:$src1, i16immSExt8:$src2)>;
-def : Pat<(and GR32:$src1, i32immSExt8:$src2),
-          (AND32ri8 GR32:$src1, i32immSExt8:$src2)>;
-def : Pat<(and GR64:$src1, i64immSExt8:$src2),
-          (AND64ri8 GR64:$src1, i64immSExt8:$src2)>;
-def : Pat<(and GR64:$src1, i64immSExt32:$src2),
-          (AND64ri32 GR64:$src1, i64immSExt32:$src2)>;
-
-// Bit scan instruction patterns to match explicit zero-undef behavior.
-def : Pat<(cttz_zero_undef GR16:$src), (BSF16rr GR16:$src)>;
-def : Pat<(cttz_zero_undef GR32:$src), (BSF32rr GR32:$src)>;
-def : Pat<(cttz_zero_undef GR64:$src), (BSF64rr GR64:$src)>;
-def : Pat<(cttz_zero_undef (loadi16 addr:$src)), (BSF16rm addr:$src)>;
-def : Pat<(cttz_zero_undef (loadi32 addr:$src)), (BSF32rm addr:$src)>;
-def : Pat<(cttz_zero_undef (loadi64 addr:$src)), (BSF64rm addr:$src)>;
-
-// When HasMOVBE is enabled it is possible to get a non-legalized
-// register-register 16 bit bswap. This maps it to a ROL instruction.
-let Predicates = [HasMOVBE] in {
- def : Pat<(bswap GR16:$src), (ROL16ri GR16:$src, (i8 8))>;
-}
 +//===- X86InstrCompiler.td - Compiler Pseudos and Patterns -*- tablegen -*-===//
 +//
 +//                     The LLVM Compiler Infrastructure
 +//
 +// This file is distributed under the University of Illinois Open Source
 +// License. See LICENSE.TXT for details.
 +//
 +//===----------------------------------------------------------------------===//
 +//
 +// This file describes the various pseudo instructions used by the compiler,
 +// as well as Pat patterns used during instruction selection.
 +//
 +//===----------------------------------------------------------------------===//
 +
 +//===----------------------------------------------------------------------===//
 +// Pattern Matching Support
 +
 +def GetLo32XForm : SDNodeXForm<imm, [{
 +  // Transformation function: get the low 32 bits.
 +  return getI32Imm((unsigned)N->getZExtValue());
 +}]>;
 +
 +def GetLo8XForm : SDNodeXForm<imm, [{
 +  // Transformation function: get the low 8 bits.
 +  return getI8Imm((uint8_t)N->getZExtValue());
 +}]>;
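
Both transforms above just truncate the matched immediate before it is re-emitted as an operand. A standalone C++ restatement of that truncation, using plain integers rather than SelectionDAG nodes and purely for illustration:

    #include <cassert>
    #include <cstdint>

    // Mirror of GetLo32XForm / GetLo8XForm: keep only the low bits of an
    // immediate; the narrowing casts perform the same modular truncation.
    uint32_t getLo32(uint64_t Imm) { return static_cast<uint32_t>(Imm); }
    uint8_t  getLo8(uint64_t Imm)  { return static_cast<uint8_t>(Imm); }

    int main() {
      assert(getLo32(0xDEADBEEFCAFEF00DULL) == 0xCAFEF00Du);
      assert(getLo8(0x1234u) == 0x34u);
      return 0;
    }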
 +
 +
 +//===----------------------------------------------------------------------===//
 +// Random Pseudo Instructions.
 +
 +// PIC base construction.  This expands to code that looks like this:
 +//     call  $next_inst
 +//     popl %destreg
 +let hasSideEffects = 0, isNotDuplicable = 1, Uses = [ESP] in
 +  def MOVPC32r : Ii32<0xE8, Pseudo, (outs GR32:$reg), (ins i32imm:$label),
 +                      "", []>;
 +
 +
 +// ADJCALLSTACKDOWN/UP implicitly use/def ESP because they may be expanded into
 +// a stack adjustment and the codegen must know that they may modify the stack
 +// pointer before prolog-epilog rewriting occurs.
 +// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
 +// sub / add which can clobber EFLAGS.
 +let Defs = [ESP, EFLAGS], Uses = [ESP] in {
 +def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
 +                           "#ADJCALLSTACKDOWN",
 +                           []>,
 +                          Requires<[NotLP64]>;
 +def ADJCALLSTACKUP32   : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
 +                           "#ADJCALLSTACKUP",
 +                           [(X86callseq_end timm:$amt1, timm:$amt2)]>,
 +                          Requires<[NotLP64]>;
 +}
 +def : Pat<(X86callseq_start timm:$amt1),
 +          (ADJCALLSTACKDOWN32 i32imm:$amt1, 0)>, Requires<[NotLP64]>;
 +
 +
 +// ADJCALLSTACKDOWN/UP implicitly use/def RSP because they may be expanded into
 +// a stack adjustment and the codegen must know that they may modify the stack
 +// pointer before prolog-epilog rewriting occurs.
 +// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
 +// sub / add which can clobber EFLAGS.
 +let Defs = [RSP, EFLAGS], Uses = [RSP] in {
 +def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
 +                           "#ADJCALLSTACKDOWN",
 +                           []>,
 +                          Requires<[IsLP64]>;
 +def ADJCALLSTACKUP64   : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
 +                           "#ADJCALLSTACKUP",
 +                           [(X86callseq_end timm:$amt1, timm:$amt2)]>,
 +                          Requires<[IsLP64]>;
 +}
 +def : Pat<(X86callseq_start timm:$amt1),
 +          (ADJCALLSTACKDOWN64 i32imm:$amt1, 0)>, Requires<[IsLP64]>;
 +
 +
 +// x86-64 va_start lowering magic.
 +let usesCustomInserter = 1, Defs = [EFLAGS] in {
 +def VASTART_SAVE_XMM_REGS : I<0, Pseudo,
 +                              (outs),
 +                              (ins GR8:$al,
 +                                   i64imm:$regsavefi, i64imm:$offset,
 +                                   variable_ops),
 +                              "#VASTART_SAVE_XMM_REGS $al, $regsavefi, $offset",
 +                              [(X86vastart_save_xmm_regs GR8:$al,
 +                                                         imm:$regsavefi,
 +                                                         imm:$offset),
 +                               (implicit EFLAGS)]>;
 +
 +// The VAARG_64 pseudo-instruction takes the address of the va_list,
 +// and places the address of the next argument into a register.
 +let Defs = [EFLAGS] in
 +def VAARG_64 : I<0, Pseudo,
 +                 (outs GR64:$dst),
 +                 (ins i8mem:$ap, i32imm:$size, i8imm:$mode, i32imm:$align),
 +                 "#VAARG_64 $dst, $ap, $size, $mode, $align",
 +                 [(set GR64:$dst,
 +                    (X86vaarg64 addr:$ap, imm:$size, imm:$mode, imm:$align)),
 +                  (implicit EFLAGS)]>;
 +
 +// Dynamic stack allocation yields a _chkstk or _alloca call for all Windows
 +// targets.  These calls are needed to probe the stack when allocating more than
 +// 4k bytes in one go. Touching the stack at 4K increments is necessary to
 +// ensure that the guard pages used by the OS virtual memory manager are
 +// allocated in correct sequence.
 +// The main point of having a separate instruction is the extra unmodelled
 +// effects it has compared to an ordinary call, such as changing the stack
 +// pointer.
 +
 +let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in
 +  def WIN_ALLOCA : I<0, Pseudo, (outs), (ins),
 +                     "# dynamic stack allocation",
 +                     [(X86WinAlloca)]>;
 +
 +// When using segmented stacks these are lowered into instructions which first
 +// check if the current stacklet has enough free memory. If it does, memory is
 +// allocated by bumping the stack pointer. Otherwise memory is allocated from
 +// the heap.
 +
 +let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in
 +def SEG_ALLOCA_32 : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$size),
 +                      "# variable sized alloca for segmented stacks",
 +                      [(set GR32:$dst,
 +                         (X86SegAlloca GR32:$size))]>,
 +                    Requires<[NotLP64]>;
 +
 +let Defs = [RAX, RSP, EFLAGS], Uses = [RSP] in
 +def SEG_ALLOCA_64 : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$size),
 +                      "# variable sized alloca for segmented stacks",
 +                      [(set GR64:$dst,
 +                         (X86SegAlloca GR64:$size))]>,
 +                    Requires<[In64BitMode]>;
 +}
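
The SEG_ALLOCA_* comment above describes a two-way decision: if the current stacklet still has room, bump the stack pointer; otherwise obtain memory out of line. A self-contained C++ sketch of that control flow; the Stacklet layout and the malloc fallback are illustrative assumptions, not the real segmented-stack runtime:

    #include <cstddef>
    #include <cstdint>
    #include <cstdlib>

    struct Stacklet {      // illustrative layout only
      uintptr_t SP;        // current stack pointer within the stacklet
      uintptr_t Limit;     // lowest usable address of the stacklet
    };

    // Variable-sized alloca under segmented stacks: bump SP when the request
    // fits, otherwise fall back to the allocator (stands in for the
    // "allocate from the heap" path mentioned above).
    void *segAlloca(Stacklet &S, size_t Size) {
      if (S.SP - S.Limit >= Size) {
        S.SP -= Size;               // enough free memory: bump the pointer
        return reinterpret_cast<void *>(S.SP);
      }
      return std::malloc(Size);     // out-of-line fallback
    }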
 +
 +// The MSVC runtime contains an _ftol2 routine for converting floating-point
 +// to integer values. It has a strange calling convention: the input is
 +// popped from the x87 stack, and the return value is given in EDX:EAX. ECX is
 +// used as a temporary register. No other registers (aside from flags) are
 +// touched.
 +// Microsoft toolchains do not support 80-bit precision, so a WIN_FTOL_80
 +// variant is unnecessary.
 +
 +let Defs = [EAX, EDX, ECX, EFLAGS], FPForm = SpecialFP in {
 +  def WIN_FTOL_32 : I<0, Pseudo, (outs), (ins RFP32:$src),
 +                      "# win32 fptoui",
 +                      [(X86WinFTOL RFP32:$src)]>,
 +                    Requires<[Not64BitMode]>;
 +
 +  def WIN_FTOL_64 : I<0, Pseudo, (outs), (ins RFP64:$src),
 +                      "# win32 fptoui",
 +                      [(X86WinFTOL RFP64:$src)]>,
 +                    Requires<[Not64BitMode]>;
 +}
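
Stripped of the unusual register conventions (x87 input, result in EDX:EAX, ECX clobbered), the WIN_FTOL pseudos exist to perform a float-to-unsigned conversion through the runtime helper. The value being computed is simply a truncating cast; a minimal C++ statement of it:

    #include <cstdint>

    // What the pseudo computes: fptoui, i.e. truncation toward zero into a
    // 64-bit unsigned integer (the real helper returns it split across
    // EDX:EAX). Assumes X is representable; out-of-range values are UB in C++.
    uint64_t fptoui64(double X) { return static_cast<uint64_t>(X); }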
 +
 +//===----------------------------------------------------------------------===//
 +// EH Pseudo Instructions
 +//
 +let SchedRW = [WriteSystem] in {
 +let isTerminator = 1, isReturn = 1, isBarrier = 1,
 +    hasCtrlDep = 1, isCodeGenOnly = 1 in {
 +def EH_RETURN   : I<0xC3, RawFrm, (outs), (ins GR32:$addr),
 +                    "ret\t#eh_return, addr: $addr",
 +                    [(X86ehret GR32:$addr)], IIC_RET>, Sched<[WriteJumpLd]>;
 +
 +}
 +
 +let isTerminator = 1, isReturn = 1, isBarrier = 1,
 +    hasCtrlDep = 1, isCodeGenOnly = 1 in {
 +def EH_RETURN64   : I<0xC3, RawFrm, (outs), (ins GR64:$addr),
 +                     "ret\t#eh_return, addr: $addr",
 +                     [(X86ehret GR64:$addr)], IIC_RET>, Sched<[WriteJumpLd]>;
 +
 +}
 +
 +let hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1,
 +    usesCustomInserter = 1 in {
 +  def EH_SjLj_SetJmp32  : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$buf),
 +                            "#EH_SJLJ_SETJMP32",
 +                            [(set GR32:$dst, (X86eh_sjlj_setjmp addr:$buf))]>,
 +                          Requires<[Not64BitMode]>;
 +  def EH_SjLj_SetJmp64  : I<0, Pseudo, (outs GR32:$dst), (ins i64mem:$buf),
 +                            "#EH_SJLJ_SETJMP64",
 +                            [(set GR32:$dst, (X86eh_sjlj_setjmp addr:$buf))]>,
 +                          Requires<[In64BitMode]>;
 +  let isTerminator = 1 in {
 +  def EH_SjLj_LongJmp32 : I<0, Pseudo, (outs), (ins i32mem:$buf),
 +                            "#EH_SJLJ_LONGJMP32",
 +                            [(X86eh_sjlj_longjmp addr:$buf)]>,
 +                          Requires<[Not64BitMode]>;
 +  def EH_SjLj_LongJmp64 : I<0, Pseudo, (outs), (ins i64mem:$buf),
 +                            "#EH_SJLJ_LONGJMP64",
 +                            [(X86eh_sjlj_longjmp addr:$buf)]>,
 +                          Requires<[In64BitMode]>;
 +  }
 +}
 +} // SchedRW
 +
 +let isBranch = 1, isTerminator = 1, isCodeGenOnly = 1 in {
 +  def EH_SjLj_Setup : I<0, Pseudo, (outs), (ins brtarget:$dst),
 +                        "#EH_SjLj_Setup\t$dst", []>;
 +}
 +
 +//===----------------------------------------------------------------------===//
 +// Pseudo instructions used by unwind info.
 +//
 +let isPseudo = 1 in {
 +  def SEH_PushReg : I<0, Pseudo, (outs), (ins i32imm:$reg),
 +                            "#SEH_PushReg $reg", []>;
 +  def SEH_SaveReg : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$dst),
 +                            "#SEH_SaveReg $reg, $dst", []>;
 +  def SEH_SaveXMM : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$dst),
 +                            "#SEH_SaveXMM $reg, $dst", []>;
 +  def SEH_StackAlloc : I<0, Pseudo, (outs), (ins i32imm:$size),
 +                            "#SEH_StackAlloc $size", []>;
 +  def SEH_SetFrame : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$offset),
 +                            "#SEH_SetFrame $reg, $offset", []>;
 +  def SEH_PushFrame : I<0, Pseudo, (outs), (ins i1imm:$mode),
 +                            "#SEH_PushFrame $mode", []>;
 +  def SEH_EndPrologue : I<0, Pseudo, (outs), (ins),
 +                            "#SEH_EndPrologue", []>;
 +  def SEH_Epilogue : I<0, Pseudo, (outs), (ins),
 +                            "#SEH_Epilogue", []>;
 +}
 +
 +//===----------------------------------------------------------------------===//
 +// Pseudo instructions used by segmented stacks.
 +//
 +
 +// This is lowered into a RET instruction by MCInstLower.  We need
 +// this so that we don't have to have a MachineBasicBlock which ends
 +// with a RET and also has successors.
 +let isPseudo = 1 in {
 +def MORESTACK_RET: I<0, Pseudo, (outs), (ins),
 +                          "", []>;
 +
 +// This instruction is lowered to a RET followed by a MOV.  The two
 +// instructions are not generated on a higher level since then the
 +// verifier sees a MachineBasicBlock ending with a non-terminator.
 +def MORESTACK_RET_RESTORE_R10 : I<0, Pseudo, (outs), (ins),
 +                                  "", []>;
 +}
 +
 +//===----------------------------------------------------------------------===//
 +// Alias Instructions
 +//===----------------------------------------------------------------------===//
 +
 +// Alias instruction mapping movr0 to xor.
 +// FIXME: remove when we can teach regalloc that xor reg, reg is ok.
 +let Defs = [EFLAGS], isReMaterializable = 1, isAsCheapAsAMove = 1,
 +    isPseudo = 1 in
 +def MOV32r0  : I<0, Pseudo, (outs GR32:$dst), (ins), "",
 +                 [(set GR32:$dst, 0)], IIC_ALU_NONMEM>, Sched<[WriteZero]>;
 +
 +// Other widths can also make use of the 32-bit xor, which may have a smaller
 +// encoding and avoid partial register updates.
 +def : Pat<(i8 0), (EXTRACT_SUBREG (MOV32r0), sub_8bit)>;
 +def : Pat<(i16 0), (EXTRACT_SUBREG (MOV32r0), sub_16bit)>;
 +def : Pat<(i64 0), (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit)> {
 +  let AddedComplexity = 20;
 +}
 +
 +// Materialize i64 constant where top 32-bits are zero. This could theoretically
 +// use MOV32ri with a SUBREG_TO_REG to represent the zero-extension, however
 +// that would make it more difficult to rematerialize.
 +let AddedComplexity = 1, isReMaterializable = 1, isAsCheapAsAMove = 1,
 +    isCodeGenOnly = 1, hasSideEffects = 0 in
 +def MOV32ri64 : Ii32<0xb8, AddRegFrm, (outs GR32:$dst), (ins i64i32imm:$src),
 +                     "", [], IIC_ALU_NONMEM>, Sched<[WriteALU]>;
 +
 +// This 64-bit pseudo-move can be used for both a 64-bit constant that is
 +// actually the zero-extension of a 32-bit constant, and for labels in the
 +// x86-64 small code model.
 +def mov64imm32 : ComplexPattern<i64, 1, "SelectMOV64Imm32", [imm, X86Wrapper]>;
 +
 +let AddedComplexity = 1 in
 +def : Pat<(i64 mov64imm32:$src),
 +          (SUBREG_TO_REG (i64 0), (MOV32ri64 mov64imm32:$src), sub_32bit)>;
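
The MOV32ri64 route is only valid when the upper 32 bits of the constant are zero, because a 32-bit mov implicitly zero-extends into the full 64-bit register. A small C++ check of that precondition and of why the round trip is lossless (the helper names are illustrative):

    #include <cassert>
    #include <cstdint>

    // Eligible for the MOV32ri64 + SUBREG_TO_REG pattern: upper 32 bits zero.
    bool fitsInZExtImm32(uint64_t Imm) { return (Imm >> 32) == 0; }

    int main() {
      uint64_t Imm = 0x00000000DEADBEEFULL;
      assert(fitsInZExtImm32(Imm));
      // Writing the low 32 bits zero-extends to 64 bits on x86-64, so
      // truncate-then-zero-extend preserves the value.
      uint64_t Rebuilt = static_cast<uint64_t>(static_cast<uint32_t>(Imm));
      assert(Rebuilt == Imm);
      return 0;
    }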
 +
 +// Use sbb to materialize carry bit.
 +let Uses = [EFLAGS], Defs = [EFLAGS], isPseudo = 1, SchedRW = [WriteALU] in {
 +// FIXME: These are pseudo ops that should be replaced with Pat<> patterns.
 +// However, Pat<> can't replicate the destination reg into the inputs of the
 +// result.
 +def SETB_C8r : I<0, Pseudo, (outs GR8:$dst), (ins), "",
 +                 [(set GR8:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
 +def SETB_C16r : I<0, Pseudo, (outs GR16:$dst), (ins), "",
 +                 [(set GR16:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
 +def SETB_C32r : I<0, Pseudo, (outs GR32:$dst), (ins), "",
 +                 [(set GR32:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
 +def SETB_C64r : I<0, Pseudo, (outs GR64:$dst), (ins), "",
 +                 [(set GR64:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
 +} // Uses = [EFLAGS], Defs = [EFLAGS], isPseudo = 1, SchedRW = [WriteALU]
 +
 +
 +def : Pat<(i16 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
 +          (SETB_C16r)>;
 +def : Pat<(i32 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
 +          (SETB_C32r)>;
 +def : Pat<(i64 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
 +          (SETB_C64r)>;
 +
 +def : Pat<(i16 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
 +          (SETB_C16r)>;
 +def : Pat<(i32 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
 +          (SETB_C32r)>;
 +def : Pat<(i64 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
 +          (SETB_C64r)>;
 +
 +// We canonicalize 'setb' to "(and (sbb reg,reg), 1)" in the hope that the and
 +// will be eliminated and that the sbb can be extended up to a wider type.  When
 +// this happens, it is great.  However, if we are left with an 8-bit sbb and an
 +// and, we might as well just match it as a setb.
 +def : Pat<(and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1),
 +          (SETBr)>;
 +
 +// (add OP, SETB) -> (adc OP, 0)
 +def : Pat<(add (and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR8:$op),
 +          (ADC8ri GR8:$op, 0)>;
 +def : Pat<(add (and (i32 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR32:$op),
 +          (ADC32ri8 GR32:$op, 0)>;
 +def : Pat<(add (and (i64 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR64:$op),
 +          (ADC64ri8 GR64:$op, 0)>;
 +
 +// (sub OP, SETB) -> (sbb OP, 0)
 +def : Pat<(sub GR8:$op, (and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1)),
 +          (SBB8ri GR8:$op, 0)>;
 +def : Pat<(sub GR32:$op, (and (i32 (X86setcc_c X86_COND_B, EFLAGS)), 1)),
 +          (SBB32ri8 GR32:$op, 0)>;
 +def : Pat<(sub GR64:$op, (and (i64 (X86setcc_c X86_COND_B, EFLAGS)), 1)),
 +          (SBB64ri8 GR64:$op, 0)>;
 +
 +// (sub OP, SETCC_CARRY) -> (adc OP, 0)
 +def : Pat<(sub GR8:$op, (i8 (X86setcc_c X86_COND_B, EFLAGS))),
 +          (ADC8ri GR8:$op, 0)>;
 +def : Pat<(sub GR32:$op, (i32 (X86setcc_c X86_COND_B, EFLAGS))),
 +          (ADC32ri8 GR32:$op, 0)>;
 +def : Pat<(sub GR64:$op, (i64 (X86setcc_c X86_COND_B, EFLAGS))),
 +          (ADC64ri8 GR64:$op, 0)>;
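
These patterns lean on the arithmetic identity behind setcc_carry: sbb of a register with itself yields 0 when the carry flag is clear and all ones when it is set, so masking with 1 recovers the setb bit, and adding or subtracting that bit folds into adc/sbb with a zero immediate. A standalone C++ restatement, modelling the carry flag as a bool:

    #include <cassert>
    #include <cstdint>

    // sbb r, r with carry C computes r - r - C, i.e. 0 or -1 (all ones).
    uint32_t sbbSelf(bool Carry) { return 0u - static_cast<uint32_t>(Carry); }

    int main() {
      assert(sbbSelf(false) == 0u);
      assert(sbbSelf(true) == UINT32_MAX);          // setcc_carry result
      assert((sbbSelf(true) & 1u) == 1u);           // (and (sbb r,r), 1) == setb
      uint32_t Op = 41;
      assert(Op + (sbbSelf(true) & 1u) == Op + 1);  // (add OP, SETB) == adc OP, 0
      return 0;
    }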
 +
 +//===----------------------------------------------------------------------===//
 +// String Pseudo Instructions
 +//
 +let SchedRW = [WriteMicrocoded] in {
 +let Defs = [ECX,EDI,ESI], Uses = [ECX,EDI,ESI], isCodeGenOnly = 1 in {
 +def REP_MOVSB_32 : I<0xA4, RawFrm, (outs), (ins), "{rep;movsb|rep movsb}",
 +                    [(X86rep_movs i8)], IIC_REP_MOVS>, REP,
 +                   Requires<[Not64BitMode]>;
 +def REP_MOVSW_32 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsw|rep movsw}",
 +                    [(X86rep_movs i16)], IIC_REP_MOVS>, REP, OpSize16,
 +                   Requires<[Not64BitMode]>;
 +def REP_MOVSD_32 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsl|rep movsd}",
 +                    [(X86rep_movs i32)], IIC_REP_MOVS>, REP, OpSize32,
 +                   Requires<[Not64BitMode]>;
 +}
 +
 +let Defs = [RCX,RDI,RSI], Uses = [RCX,RDI,RSI], isCodeGenOnly = 1 in {
 +def REP_MOVSB_64 : I<0xA4, RawFrm, (outs), (ins), "{rep;movsb|rep movsb}",
 +                    [(X86rep_movs i8)], IIC_REP_MOVS>, REP,
 +                   Requires<[In64BitMode]>;
 +def REP_MOVSW_64 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsw|rep movsw}",
 +                    [(X86rep_movs i16)], IIC_REP_MOVS>, REP, OpSize16,
 +                   Requires<[In64BitMode]>;
 +def REP_MOVSD_64 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsl|rep movsd}",
 +                    [(X86rep_movs i32)], IIC_REP_MOVS>, REP, OpSize32,
 +                   Requires<[In64BitMode]>;
 +def REP_MOVSQ_64 : RI<0xA5, RawFrm, (outs), (ins), "{rep;movsq|rep movsq}",
 +                    [(X86rep_movs i64)], IIC_REP_MOVS>, REP,
 +                   Requires<[In64BitMode]>;
 +}
 +
 +// FIXME: Should use "(X86rep_stos AL)" as the pattern.
 +let Defs = [ECX,EDI], isCodeGenOnly = 1 in {
 +  let Uses = [AL,ECX,EDI] in
 +  def REP_STOSB_32 : I<0xAA, RawFrm, (outs), (ins), "{rep;stosb|rep stosb}",
 +                      [(X86rep_stos i8)], IIC_REP_STOS>, REP,
 +                     Requires<[Not64BitMode]>;
 +  let Uses = [AX,ECX,EDI] in
 +  def REP_STOSW_32 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosw|rep stosw}",
 +                      [(X86rep_stos i16)], IIC_REP_STOS>, REP, OpSize16,
 +                     Requires<[Not64BitMode]>;
 +  let Uses = [EAX,ECX,EDI] in
 +  def REP_STOSD_32 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosl|rep stosd}",
 +                      [(X86rep_stos i32)], IIC_REP_STOS>, REP, OpSize32,
 +                     Requires<[Not64BitMode]>;
 +}
 +
 +let Defs = [RCX,RDI], isCodeGenOnly = 1 in {
 +  let Uses = [AL,RCX,RDI] in
 +  def REP_STOSB_64 : I<0xAA, RawFrm, (outs), (ins), "{rep;stosb|rep stosb}",
 +                      [(X86rep_stos i8)], IIC_REP_STOS>, REP,
 +                     Requires<[In64BitMode]>;
 +  let Uses = [AX,RCX,RDI] in
 +  def REP_STOSW_64 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosw|rep stosw}",
 +                      [(X86rep_stos i16)], IIC_REP_STOS>, REP, OpSize16,
 +                     Requires<[In64BitMode]>;
 +  let Uses = [RAX,RCX,RDI] in
 +  def REP_STOSD_64 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosl|rep stosd}",
 +                      [(X86rep_stos i32)], IIC_REP_STOS>, REP, OpSize32,
 +                     Requires<[In64BitMode]>;
 +
 +  let Uses = [RAX,RCX,RDI] in
 +  def REP_STOSQ_64 : RI<0xAB, RawFrm, (outs), (ins), "{rep;stosq|rep stosq}",
 +                      [(X86rep_stos i64)], IIC_REP_STOS>, REP,
 +                     Requires<[In64BitMode]>;
 +}
 +} // SchedRW
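
The REP_MOVS*/REP_STOS* pseudos wrap the x86 string instructions used for inline memcpy/memset expansion: rep movs copies xCX elements from [xSI] to [xDI], and rep stos stores the accumulator into xCX elements at [xDI], with the count and pointer registers updated as the loop runs. A byte-sized C++ model of the two loops, assuming the forward direction (direction flag clear):

    #include <cstddef>
    #include <cstdint>

    // rep movsb: copy Count bytes from Src to Dst, advancing both pointers
    // and leaving Count at zero, like the register state after the loop.
    void repMovsb(uint8_t *&Dst, const uint8_t *&Src, size_t &Count) {
      for (; Count != 0; --Count)
        *Dst++ = *Src++;
    }

    // rep stosb: store Val into Count consecutive bytes at Dst.
    void repStosb(uint8_t *&Dst, uint8_t Val, size_t &Count) {
      for (; Count != 0; --Count)
        *Dst++ = Val;
    }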
 +
 +//===----------------------------------------------------------------------===//
 +// Thread Local Storage Instructions
 +//
 +
 +// ELF TLS Support
 +// All calls clobber the non-callee saved registers. ESP is marked as
 +// a use to prevent stack-pointer assignments that appear immediately
 +// before calls from potentially appearing dead.
 +let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, FP7,
 +            ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7,
 +            MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
 +            XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
 +            XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
 +    Uses = [ESP] in {
 +def TLS_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
 +                  "# TLS_addr32",
 +                  [(X86tlsaddr tls32addr:$sym)]>,
 +                  Requires<[Not64BitMode]>;
 +def TLS_base_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
 +                  "# TLS_base_addr32",
 +                  [(X86tlsbaseaddr tls32baseaddr:$sym)]>,
 +                  Requires<[Not64BitMode]>;
 +}
 +
 +// All calls clobber the non-callee saved registers. RSP is marked as
 +// a use to prevent stack-pointer assignments that appear immediately
 +// before calls from potentially appearing dead.
 +let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
 +            FP0, FP1, FP2, FP3, FP4, FP5, FP6, FP7,
 +            ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7,
 +            MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
 +            XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
 +            XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
 +    Uses = [RSP] in {
 +def TLS_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
 +                   "# TLS_addr64",
 +                  [(X86tlsaddr tls64addr:$sym)]>,
 +                  Requires<[In64BitMode]>;
 +def TLS_base_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
 +                   "# TLS_base_addr64",
 +                  [(X86tlsbaseaddr tls64baseaddr:$sym)]>,
 +                  Requires<[In64BitMode]>;
 +}
 +
 +// Darwin TLS Support
 +// For i386, the address of the thunk is passed on the stack, on return the
 +// address of the variable is in %eax.  %ecx is trashed during the function
 +// call.  All other registers are preserved.
 +let Defs = [EAX, ECX, EFLAGS],
 +    Uses = [ESP],
 +    usesCustomInserter = 1 in
 +def TLSCall_32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
 +                "# TLSCall_32",
 +                [(X86TLSCall addr:$sym)]>,
 +                Requires<[Not64BitMode]>;
 +
 +// For x86_64, the address of the thunk is passed in %rdi, on return
 +// the address of the variable is in %rax.  All other registers are preserved.
 +let Defs = [RAX, EFLAGS],
 +    Uses = [RSP, RDI],
 +    usesCustomInserter = 1 in
 +def TLSCall_64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
 +                  "# TLSCall_64",
 +                  [(X86TLSCall addr:$sym)]>,
 +                  Requires<[In64BitMode]>;
 +
 +
 +//===----------------------------------------------------------------------===//
 +// Conditional Move Pseudo Instructions
 +
 +// X86 doesn't have 8-bit conditional moves. Use a customInserter to
 +// emit control flow. An alternative to this is to mark i8 SELECT as Promote,
 +// however that requires promoting the operands, and can induce additional
 +// i8 register pressure.
 +let usesCustomInserter = 1, Uses = [EFLAGS] in {
 +def CMOV_GR8 : I<0, Pseudo,
 +                 (outs GR8:$dst), (ins GR8:$src1, GR8:$src2, i8imm:$cond),
 +                 "#CMOV_GR8 PSEUDO!",
 +                 [(set GR8:$dst, (X86cmov GR8:$src1, GR8:$src2,
 +                                          imm:$cond, EFLAGS))]>;
 +
 +let Predicates = [NoCMov] in {
 +def CMOV_GR32 : I<0, Pseudo,
 +                    (outs GR32:$dst), (ins GR32:$src1, GR32:$src2, i8imm:$cond),
 +                    "#CMOV_GR32* PSEUDO!",
 +                    [(set GR32:$dst,
 +                      (X86cmov GR32:$src1, GR32:$src2, imm:$cond, EFLAGS))]>;
 +def CMOV_GR16 : I<0, Pseudo,
 +                    (outs GR16:$dst), (ins GR16:$src1, GR16:$src2, i8imm:$cond),
 +                    "#CMOV_GR16* PSEUDO!",
 +                    [(set GR16:$dst,
 +                      (X86cmov GR16:$src1, GR16:$src2, imm:$cond, EFLAGS))]>;
 +} // Predicates = [NoCMov]
 +
 +// fcmov doesn't handle all possible EFLAGS, provide a fallback if there is no
 +// SSE1.
 +let Predicates = [FPStackf32] in
 +def CMOV_RFP32 : I<0, Pseudo,
 +                    (outs RFP32:$dst),
 +                    (ins RFP32:$src1, RFP32:$src2, i8imm:$cond),
 +                    "#CMOV_RFP32 PSEUDO!",
 +                    [(set RFP32:$dst,
 +                      (X86cmov RFP32:$src1, RFP32:$src2, imm:$cond,
 +                                                  EFLAGS))]>;
 +// fcmov doesn't handle all possible EFLAGS, provide a fallback if there is no
 +// SSE2.
 +let Predicates = [FPStackf64] in
 +def CMOV_RFP64 : I<0, Pseudo,
 +                    (outs RFP64:$dst),
 +                    (ins RFP64:$src1, RFP64:$src2, i8imm:$cond),
 +                    "#CMOV_RFP64 PSEUDO!",
 +                    [(set RFP64:$dst,
 +                      (X86cmov RFP64:$src1, RFP64:$src2, imm:$cond,
 +                                                  EFLAGS))]>;
 +def CMOV_RFP80 : I<0, Pseudo,
 +                    (outs RFP80:$dst),
 +                    (ins RFP80:$src1, RFP80:$src2, i8imm:$cond),
 +                    "#CMOV_RFP80 PSEUDO!",
 +                    [(set RFP80:$dst,
 +                      (X86cmov RFP80:$src1, RFP80:$src2, imm:$cond,
 +                                                  EFLAGS))]>;
 +} // usesCustomInserter = 1, Uses = [EFLAGS]
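
Since there is no 8-bit cmov and fcmov cannot test every condition, the custom inserter expands these pseudos after selection into compare-and-branch control flow. Functionally each pseudo is just a select; a schematic C++ version with generic parameter names:

    #include <cstdint>

    // What CMOV_GR8 computes once the custom inserter has expanded it into a
    // branch diamond: a plain select between two i8 values on an EFLAGS test.
    uint8_t cmov8(bool CondHolds, uint8_t TrueVal, uint8_t FalseVal) {
      if (CondHolds)        // becomes a conditional branch around one block
        return TrueVal;
      return FalseVal;
    }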
 +
 +
 +//===----------------------------------------------------------------------===//
 +// Normal-Instructions-With-Lock-Prefix Pseudo Instructions
 +//===----------------------------------------------------------------------===//
 +
 +// FIXME: Use normal instructions and add lock prefix dynamically.
 +
 +// Memory barriers
 +
 +// TODO: Get this to fold the constant into the instruction.
 +let isCodeGenOnly = 1, Defs = [EFLAGS] in
 +def OR32mrLocked  : I<0x09, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$zero),
 +                      "or{l}\t{$zero, $dst|$dst, $zero}",
 +                      [], IIC_ALU_MEM>, Requires<[Not64BitMode]>, LOCK,
 +                    Sched<[WriteALULd, WriteRMW]>;
 +
 +let hasSideEffects = 1 in
 +def Int_MemBarrier : I<0, Pseudo, (outs), (ins),
 +                     "#MEMBARRIER",
 +                     [(X86MemBarrier)]>, Sched<[WriteLoad]>;
 +
 +// RegOpc corresponds to the mr version of the instruction
 +// ImmOpc corresponds to the mi version of the instruction
 +// ImmOpc8 corresponds to the mi8 version of the instruction
 +// ImmMod corresponds to the instruction format of the mi and mi8 versions
 +multiclass LOCK_ArithBinOp<bits<8> RegOpc, bits<8> ImmOpc, bits<8> ImmOpc8,
 +                           Format ImmMod, string mnemonic> {
 +let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1,
 +    SchedRW = [WriteALULd, WriteRMW] in {
 +
 +def NAME#8mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
 +                  RegOpc{3}, RegOpc{2}, RegOpc{1}, 0 },
 +                  MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src2),
 +                  !strconcat(mnemonic, "{b}\t",
 +                             "{$src2, $dst|$dst, $src2}"),
 +                  [], IIC_ALU_NONMEM>, LOCK;
 +def NAME#16mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
 +                   RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
 +                   MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
 +                   !strconcat(mnemonic, "{w}\t",
 +                              "{$src2, $dst|$dst, $src2}"),
 +                   [], IIC_ALU_NONMEM>, OpSize16, LOCK;
 +def NAME#32mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
 +                   RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
 +                   MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
 +                   !strconcat(mnemonic, "{l}\t",
 +                              "{$src2, $dst|$dst, $src2}"),
 +                   [], IIC_ALU_NONMEM>, OpSize32, LOCK;
 +def NAME#64mr : RI<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
 +                    RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
 +                    MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
 +                    !strconcat(mnemonic, "{q}\t",
 +                               "{$src2, $dst|$dst, $src2}"),
 +                    [], IIC_ALU_NONMEM>, LOCK;
 +
 +def NAME#8mi : Ii8<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
 +                    ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 0 },
 +                    ImmMod, (outs), (ins i8mem :$dst, i8imm :$src2),
 +                    !strconcat(mnemonic, "{b}\t",
 +                               "{$src2, $dst|$dst, $src2}"),
 +                    [], IIC_ALU_MEM>, LOCK;
 +
 +def NAME#16mi : Ii16<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
 +                      ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
 +                      ImmMod, (outs), (ins i16mem :$dst, i16imm :$src2),
 +                      !strconcat(mnemonic, "{w}\t",
 +                                 "{$src2, $dst|$dst, $src2}"),
 +                      [], IIC_ALU_MEM>, OpSize16, LOCK;
 +
 +def NAME#32mi : Ii32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
 +                      ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
 +                      ImmMod, (outs), (ins i32mem :$dst, i32imm :$src2),
 +                      !strconcat(mnemonic, "{l}\t",
 +                                 "{$src2, $dst|$dst, $src2}"),
 +                      [], IIC_ALU_MEM>, OpSize32, LOCK;
 +
 +def NAME#64mi32 : RIi32S<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
 +                          ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
 +                          ImmMod, (outs), (ins i64mem :$dst, i64i32imm :$src2),
 +                          !strconcat(mnemonic, "{q}\t",
 +                                     "{$src2, $dst|$dst, $src2}"),
 +                          [], IIC_ALU_MEM>, LOCK;
 +
 +def NAME#16mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
 +                      ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
 +                      ImmMod, (outs), (ins i16mem :$dst, i16i8imm :$src2),
 +                      !strconcat(mnemonic, "{w}\t",
 +                                 "{$src2, $dst|$dst, $src2}"),
 +                      [], IIC_ALU_MEM>, OpSize16, LOCK;
 +def NAME#32mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
 +                      ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
 +                      ImmMod, (outs), (ins i32mem :$dst, i32i8imm :$src2),
 +                      !strconcat(mnemonic, "{l}\t",
 +                                 "{$src2, $dst|$dst, $src2}"),
 +                      [], IIC_ALU_MEM>, OpSize32, LOCK;
 +def NAME#64mi8 : RIi8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
 +                       ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
 +                       ImmMod, (outs), (ins i64mem :$dst, i64i8imm :$src2),
 +                       !strconcat(mnemonic, "{q}\t",
 +                                  "{$src2, $dst|$dst, $src2}"),
 +                       [], IIC_ALU_MEM>, LOCK;
 +
 +}
 +
 +}
 +
 +defm LOCK_ADD : LOCK_ArithBinOp<0x00, 0x80, 0x83, MRM0m, "add">;
 +defm LOCK_SUB : LOCK_ArithBinOp<0x28, 0x80, 0x83, MRM5m, "sub">;
 +defm LOCK_OR  : LOCK_ArithBinOp<0x08, 0x80, 0x83, MRM1m, "or">;
 +defm LOCK_AND : LOCK_ArithBinOp<0x20, 0x80, 0x83, MRM4m, "and">;
 +defm LOCK_XOR : LOCK_ArithBinOp<0x30, 0x80, 0x83, MRM6m, "xor">;
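
The bit lists {RegOpc{7}, ..., 0} and {RegOpc{7}, ..., 1} in the multiclass above keep the upper seven bits of the base ALU opcode and force bit 0, which in the x86 encoding selects the byte form (bit 0 clear) versus the 16/32/64-bit form (bit 0 set, further distinguished by prefixes). A small sketch of that bit manipulation, using the add/sub register-to-memory opcodes as examples:

    #include <cassert>
    #include <cstdint>

    // Keep bits 7..1 of the base opcode and force bit 0: 0 selects the byte
    // form, 1 the word/dword/qword form.
    uint8_t aluOpcode(uint8_t BaseOpc, bool WideForm) {
      return static_cast<uint8_t>((BaseOpc & 0xFEu) | (WideForm ? 1u : 0u));
    }

    int main() {
      assert(aluOpcode(0x00, false) == 0x00); // add r/m8,  r8
      assert(aluOpcode(0x00, true)  == 0x01); // add r/m32, r32 (with OpSize/REX.W)
      assert(aluOpcode(0x28, true)  == 0x29); // sub r/m32, r32
      return 0;
    }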
 +
 +// Optimized codegen when the non-memory output is not used.
 +multiclass LOCK_ArithUnOp<bits<8> Opc8, bits<8> Opc, Format Form,
 +                          string mnemonic> {
 +let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1,
 +    SchedRW = [WriteALULd, WriteRMW] in {
 +
 +def NAME#8m  : I<Opc8, Form, (outs), (ins i8mem :$dst),
 +                 !strconcat(mnemonic, "{b}\t$dst"),
 +                 [], IIC_UNARY_MEM>, LOCK;
 +def NAME#16m : I<Opc, Form, (outs), (ins i16mem:$dst),
 +                 !strconcat(mnemonic, "{w}\t$dst"),
 +                 [], IIC_UNARY_MEM>, OpSize16, LOCK;
 +def NAME#32m : I<Opc, Form, (outs), (ins i32mem:$dst),
 +                 !strconcat(mnemonic, "{l}\t$dst"),
 +                 [], IIC_UNARY_MEM>, OpSize32, LOCK;
 +def NAME#64m : RI<Opc, Form, (outs), (ins i64mem:$dst),
 +                  !strconcat(mnemonic, "{q}\t$dst"),
 +                  [], IIC_UNARY_MEM>, LOCK;
 +}
 +}
 +
 +defm LOCK_INC    : LOCK_ArithUnOp<0xFE, 0xFF, MRM0m, "inc">;
 +defm LOCK_DEC    : LOCK_ArithUnOp<0xFE, 0xFF, MRM1m, "dec">;
 +
 +// Atomic compare and swap.
 +multiclass LCMPXCHG_UnOp<bits<8> Opc, Format Form, string mnemonic,
 +                         SDPatternOperator frag, X86MemOperand x86memop,
 +                         InstrItinClass itin> {
 +let isCodeGenOnly = 1 in {
 +  def NAME : I<Opc, Form, (outs), (ins x86memop:$ptr),
 +               !strconcat(mnemonic, "\t$ptr"),
 +               [(frag addr:$ptr)], itin>, TB, LOCK;
 +}
 +}
 +
 +multiclass LCMPXCHG_BinOp<bits<8> Opc8, bits<8> Opc, Format Form,
 +                          string mnemonic, SDPatternOperator frag,
 +                          InstrItinClass itin8, InstrItinClass itin> {
 +let isCodeGenOnly = 1, SchedRW = [WriteALULd, WriteRMW] in {
 +  let Defs = [AL, EFLAGS], Uses = [AL] in
 +  def NAME#8  : I<Opc8, Form, (outs), (ins i8mem:$ptr, GR8:$swap),
 +                  !strconcat(mnemonic, "{b}\t{$swap, $ptr|$ptr, $swap}"),
 +                  [(frag addr:$ptr, GR8:$swap, 1)], itin8>, TB, LOCK;
 +  let Defs = [AX, EFLAGS], Uses = [AX] in
 +  def NAME#16 : I<Opc, Form, (outs), (ins i16mem:$ptr, GR16:$swap),
 +                  !strconcat(mnemonic, "{w}\t{$swap, $ptr|$ptr, $swap}"),
 +                  [(frag addr:$ptr, GR16:$swap, 2)], itin>, TB, OpSize16, LOCK;
 +  let Defs = [EAX, EFLAGS], Uses = [EAX] in
 +  def NAME#32 : I<Opc, Form, (outs), (ins i32mem:$ptr, GR32:$swap),
 +                  !strconcat(mnemonic, "{l}\t{$swap, $ptr|$ptr, $swap}"),
 +                  [(frag addr:$ptr, GR32:$swap, 4)], itin>, TB, OpSize32, LOCK;
 +  let Defs = [RAX, EFLAGS], Uses = [RAX] in
 +  def NAME#64 : RI<Opc, Form, (outs), (ins i64mem:$ptr, GR64:$swap),
 +                   !strconcat(mnemonic, "{q}\t{$swap, $ptr|$ptr, $swap}"),
 +                   [(frag addr:$ptr, GR64:$swap, 8)], itin>, TB, LOCK;
 +}
 +}
 +
 +let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX],
 +    SchedRW = [WriteALULd, WriteRMW] in {
 +defm LCMPXCHG8B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg8b",
 +                                X86cas8, i64mem,
 +                                IIC_CMPX_LOCK_8B>;
 +}
 +
 +let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX],
 +    Predicates = [HasCmpxchg16b], SchedRW = [WriteALULd, WriteRMW] in {
 +defm LCMPXCHG16B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg16b",
 +                                 X86cas16, i128mem,
 +                                 IIC_CMPX_LOCK_16B>, REX_W;
 +}
 +
 +defm LCMPXCHG : LCMPXCHG_BinOp<0xB0, 0xB1, MRMDestMem, "cmpxchg",
 +                               X86cas, IIC_CMPX_LOCK_8, IIC_CMPX_LOCK>;
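
lock cmpxchg compares the accumulator with the memory operand; on a match the swap value is stored, otherwise the current memory value is loaded back into the accumulator, with ZF recording which case happened. The std::atomic analogue below shows the same success/failure behaviour; the register roles are hidden behind the library call, which on x86 typically compiles down to this instruction:

    #include <atomic>
    #include <cassert>

    int main() {
      std::atomic<int> Mem{42};
      int Expected = 42;                 // plays the role of the accumulator
      // Success: Mem == Expected, so 99 is stored and true is returned.
      bool Swapped = Mem.compare_exchange_strong(Expected, 99);
      assert(Swapped && Mem.load() == 99);

      Expected = 0;                      // stale expectation
      // Failure: the current value (99) is loaded back into Expected.
      Swapped = Mem.compare_exchange_strong(Expected, 7);
      assert(!Swapped && Expected == 99 && Mem.load() == 99);
      return 0;
    }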
 +
 +// Atomic exchange and add
 +multiclass ATOMIC_LOAD_BINOP<bits<8> opc8, bits<8> opc, string mnemonic,
 +                             string frag,
 +                             InstrItinClass itin8, InstrItinClass itin> {
 +  let Constraints = "$val = $dst", Defs = [EFLAGS], isCodeGenOnly = 1,
 +      SchedRW = [WriteALULd, WriteRMW] in {
 +    def NAME#8  : I<opc8, MRMSrcMem, (outs GR8:$dst),
 +                    (ins GR8:$val, i8mem:$ptr),
 +                    !strconcat(mnemonic, "{b}\t{$val, $ptr|$ptr, $val}"),
 +                    [(set GR8:$dst,
 +                          (!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val))],
 +                    itin8>;
 +    def NAME#16 : I<opc, MRMSrcMem, (outs GR16:$dst),
 +                    (ins GR16:$val, i16mem:$ptr),
 +                    !strconcat(mnemonic, "{w}\t{$val, $ptr|$ptr, $val}"),
 +                    [(set
 +                       GR16:$dst,
 +                       (!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val))],
 +                    itin>, OpSize16;
 +    def NAME#32 : I<opc, MRMSrcMem, (outs GR32:$dst),
 +                    (ins GR32:$val, i32mem:$ptr),
 +                    !strconcat(mnemonic, "{l}\t{$val, $ptr|$ptr, $val}"),
 +                    [(set
 +                       GR32:$dst,
 +                       (!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val))],
 +                    itin>, OpSize32;
 +    def NAME#64 : RI<opc, MRMSrcMem, (outs GR64:$dst),
 +                     (ins GR64:$val, i64mem:$ptr),
 +                     !strconcat(mnemonic, "{q}\t{$val, $ptr|$ptr, $val}"),
 +                     [(set
 +                        GR64:$dst,
 +                        (!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val))],
 +                     itin>;
 +  }
 +}
 +
 +defm LXADD : ATOMIC_LOAD_BINOP<0xc0, 0xc1, "xadd", "atomic_load_add",
 +                               IIC_XADD_LOCK_MEM8, IIC_XADD_LOCK_MEM>,
 +             TB, LOCK;
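
lock xadd writes the sum back to memory and leaves the previous memory value in the source register, which is exactly the read-modify-write shape of an atomic fetch-and-add. A short std::atomic illustration of that old-value/new-value split:

    #include <atomic>
    #include <cassert>

    int main() {
      std::atomic<int> Counter{10};
      // fetch_add returns the old value and leaves old+arg in memory,
      // matching the xadd register/memory roles described above.
      int Old = Counter.fetch_add(5);
      assert(Old == 10 && Counter.load() == 15);
      return 0;
    }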
 +
 +/* The following multiclass tries to make sure that in code like
 + *    x.store (immediate op x.load(acquire), release)
 + * an operation directly on memory is generated instead of wasting a register.
 + * It is not automatic as atomic_store/load are only lowered to MOV instructions
 + * extremely late to prevent them from being accidentally reordered in the backend
 + * (see below the RELEASE_MOV* / ACQUIRE_MOV* pseudo-instructions)
 + */
 +multiclass RELEASE_BINOP_MI<string op> {
 +    def NAME#8mi : I<0, Pseudo, (outs), (ins i8mem:$dst, i8imm:$src),
 +        "#RELEASE_BINOP PSEUDO!",
 +        [(atomic_store_8 addr:$dst, (!cast<PatFrag>(op)
 +            (atomic_load_8 addr:$dst), (i8 imm:$src)))]>;
 +    // NAME#16 is not generated as 16-bit arithmetic instructions are considered
 +    // costly and avoided as far as possible by this backend anyway
 +    def NAME#32mi : I<0, Pseudo, (outs), (ins i32mem:$dst, i32imm:$src),
 +        "#RELEASE_BINOP PSEUDO!",
 +        [(atomic_store_32 addr:$dst, (!cast<PatFrag>(op)
 +            (atomic_load_32 addr:$dst), (i32 imm:$src)))]>;
 +    def NAME#64mi32 : I<0, Pseudo, (outs), (ins i64mem:$dst, i64i32imm:$src),
 +        "#RELEASE_BINOP PSEUDO!",
 +        [(atomic_store_64 addr:$dst, (!cast<PatFrag>(op)
 +            (atomic_load_64 addr:$dst), (i64immSExt32:$src)))]>;
 +}
 +defm RELEASE_ADD : RELEASE_BINOP_MI<"add">;
 +defm RELEASE_AND : RELEASE_BINOP_MI<"and">;
 +defm RELEASE_OR  : RELEASE_BINOP_MI<"or">;
 +defm RELEASE_XOR : RELEASE_BINOP_MI<"xor">;
 +// Note: we don't deal with sub, because subtractions of constants are
 +// optimized into additions before this code can run
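
The idiom these RELEASE_BINOP_MI pseudos are written to catch looks as follows in C++; when it matches, the whole update can stay as a single memory-destination instruction instead of occupying a register (the constant 5 is arbitrary):

    #include <atomic>

    void bumpRelease(std::atomic<int> &X) {
      // x.store(immediate op x.load(acquire), release): the shape the
      // RELEASE_BINOP_MI patterns above are meant to match.
      X.store(X.load(std::memory_order_acquire) + 5, std::memory_order_release);
    }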
 +
 +multiclass RELEASE_UNOP<dag dag8, dag dag16, dag dag32, dag dag64> {
 +    def NAME#8m : I<0, Pseudo, (outs), (ins i8mem:$dst),
 +        "#RELEASE_UNOP PSEUDO!",
 +        [(atomic_store_8 addr:$dst, dag8)]>;
 +    def NAME#16m : I<0, Pseudo, (outs), (ins i16mem:$dst),
 +        "#RELEASE_UNOP PSEUDO!",
 +        [(atomic_store_16 addr:$dst, dag16)]>;
 +    def NAME#32m : I<0, Pseudo, (outs), (ins i32mem:$dst),
 +        "#RELEASE_UNOP PSEUDO!",
 +        [(atomic_store_32 addr:$dst, dag32)]>;
 +    def NAME#64m : I<0, Pseudo, (outs), (ins i64mem:$dst),
 +        "#RELEASE_UNOP PSEUDO!",
 +        [(atomic_store_64 addr:$dst, dag64)]>;
 +}
 +
 +defm RELEASE_INC : RELEASE_UNOP<
 +    (add (atomic_load_8  addr:$dst), (i8 1)),
 +    (add (atomic_load_16 addr:$dst), (i16 1)),
 +    (add (atomic_load_32 addr:$dst), (i32 1)),
 +    (add (atomic_load_64 addr:$dst), (i64 1))>, Requires<[NotSlowIncDec]>;
 +defm RELEASE_DEC : RELEASE_UNOP<
 +    (add (atomic_load_8  addr:$dst), (i8 -1)),
 +    (add (atomic_load_16 addr:$dst), (i16 -1)),
 +    (add (atomic_load_32 addr:$dst), (i32 -1)),
 +    (add (atomic_load_64 addr:$dst), (i64 -1))>, Requires<[NotSlowIncDec]>;
 +/*
 +TODO: These don't work because the type inference of TableGen fails.
 +TODO: find a way to fix it.
 +defm RELEASE_NEG : RELEASE_UNOP<
 +    (ineg (atomic_load_8  addr:$dst)),
 +    (ineg (atomic_load_16 addr:$dst)),
 +    (ineg (atomic_load_32 addr:$dst)),
 +    (ineg (atomic_load_64 addr:$dst))>;
 +defm RELEASE_NOT : RELEASE_UNOP<
 +    (not (atomic_load_8  addr:$dst)),
 +    (not (atomic_load_16 addr:$dst)),
 +    (not (atomic_load_32 addr:$dst)),
 +    (not (atomic_load_64 addr:$dst))>;
 +*/
 +
 +def RELEASE_MOV8mi : I<0, Pseudo, (outs), (ins i8mem:$dst, i8imm:$src),
 +                        "#RELEASE_MOV PSEUDO!",
 +                        [(atomic_store_8 addr:$dst, (i8 imm:$src))]>;
 +def RELEASE_MOV16mi : I<0, Pseudo, (outs), (ins i16mem:$dst, i16imm:$src),
 +                        "#RELEASE_MOV PSEUDO!",
 +                        [(atomic_store_16 addr:$dst, (i16 imm:$src))]>;
 +def RELEASE_MOV32mi : I<0, Pseudo, (outs), (ins i32mem:$dst, i32imm:$src),
 +                        "#RELEASE_MOV PSEUDO!",
 +                        [(atomic_store_32 addr:$dst, (i32 imm:$src))]>;
 +def RELEASE_MOV64mi32 : I<0, Pseudo, (outs), (ins i64mem:$dst, i64i32imm:$src),
 +                        "#RELEASE_MOV PSEUDO!",
 +                        [(atomic_store_64 addr:$dst, i64immSExt32:$src)]>;
 +
 +def RELEASE_MOV8mr  : I<0, Pseudo, (outs), (ins i8mem :$dst, GR8 :$src),
 +                        "#RELEASE_MOV PSEUDO!",
 +                        [(atomic_store_8  addr:$dst, GR8 :$src)]>;
 +def RELEASE_MOV16mr : I<0, Pseudo, (outs), (ins i16mem:$dst, GR16:$src),
 +                        "#RELEASE_MOV PSEUDO!",
 +                        [(atomic_store_16 addr:$dst, GR16:$src)]>;
 +def RELEASE_MOV32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, GR32:$src),
 +                        "#RELEASE_MOV PSEUDO!",
 +                        [(atomic_store_32 addr:$dst, GR32:$src)]>;
 +def RELEASE_MOV64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, GR64:$src),
 +                        "#RELEASE_MOV PSEUDO!",
 +                        [(atomic_store_64 addr:$dst, GR64:$src)]>;
 +
 +def ACQUIRE_MOV8rm  : I<0, Pseudo, (outs GR8 :$dst), (ins i8mem :$src),
 +                      "#ACQUIRE_MOV PSEUDO!",
 +                      [(set GR8:$dst,  (atomic_load_8  addr:$src))]>;
 +def ACQUIRE_MOV16rm : I<0, Pseudo, (outs GR16:$dst), (ins i16mem:$src),
 +                      "#ACQUIRE_MOV PSEUDO!",
 +                      [(set GR16:$dst, (atomic_load_16 addr:$src))]>;
 +def ACQUIRE_MOV32rm : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$src),
 +                      "#ACQUIRE_MOV PSEUDO!",
 +                      [(set GR32:$dst, (atomic_load_32 addr:$src))]>;
 +def ACQUIRE_MOV64rm : I<0, Pseudo, (outs GR64:$dst), (ins i64mem:$src),
 +                      "#ACQUIRE_MOV PSEUDO!",
 +                      [(set GR64:$dst, (atomic_load_64 addr:$src))]>;
 +//===----------------------------------------------------------------------===//
 +// Conditional Move Pseudo Instructions.
 +//===----------------------------------------------------------------------===//
 +
 +// CMOV* - Used to implement the SSE SELECT DAG operation.  Expanded after
 +// instruction selection into a branch sequence.
 +let Uses = [EFLAGS], usesCustomInserter = 1 in {
 +  def CMOV_FR32 : I<0, Pseudo,
 +                    (outs FR32:$dst), (ins FR32:$t, FR32:$f, i8imm:$cond),
 +                    "#CMOV_FR32 PSEUDO!",
 +                    [(set FR32:$dst, (X86cmov FR32:$t, FR32:$f, imm:$cond,
 +                                                  EFLAGS))]>;
 +  def CMOV_FR64 : I<0, Pseudo,
 +                    (outs FR64:$dst), (ins FR64:$t, FR64:$f, i8imm:$cond),
 +                    "#CMOV_FR64 PSEUDO!",
 +                    [(set FR64:$dst, (X86cmov FR64:$t, FR64:$f, imm:$cond,
 +                                                  EFLAGS))]>;
 +  def CMOV_V4F32 : I<0, Pseudo,
 +                    (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond),
 +                    "#CMOV_V4F32 PSEUDO!",
 +                    [(set VR128:$dst,
 +                      (v4f32 (X86cmov VR128:$t, VR128:$f, imm:$cond,
 +                                          EFLAGS)))]>;
 +  def CMOV_V2F64 : I<0, Pseudo,
 +                    (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond),
 +                    "#CMOV_V2F64 PSEUDO!",
 +                    [(set VR128:$dst,
 +                      (v2f64 (X86cmov VR128:$t, VR128:$f, imm:$cond,
 +                                          EFLAGS)))]>;
 +  def CMOV_V2I64 : I<0, Pseudo,
 +                    (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond),
 +                    "#CMOV_V2I64 PSEUDO!",
 +                    [(set VR128:$dst,
 +                      (v2i64 (X86cmov VR128:$t, VR128:$f, imm:$cond,
 +                                          EFLAGS)))]>;
 +  def CMOV_V8F32 : I<0, Pseudo,
 +                    (outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond),
 +                    "#CMOV_V8F32 PSEUDO!",
 +                    [(set VR256:$dst,
 +                      (v8f32 (X86cmov VR256:$t, VR256:$f, imm:$cond,
 +                                          EFLAGS)))]>;
 +  def CMOV_V4F64 : I<0, Pseudo,
 +                    (outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond),
 +                    "#CMOV_V4F64 PSEUDO!",
 +                    [(set VR256:$dst,
 +                      (v4f64 (X86cmov VR256:$t, VR256:$f, imm:$cond,
 +                                          EFLAGS)))]>;
 +  def CMOV_V4I64 : I<0, Pseudo,
 +                    (outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond),
 +                    "#CMOV_V4I64 PSEUDO!",
 +                    [(set VR256:$dst,
 +                      (v4i64 (X86cmov VR256:$t, VR256:$f, imm:$cond,
 +                                          EFLAGS)))]>;
 +  def CMOV_V8I64 : I<0, Pseudo,
 +                    (outs VR512:$dst), (ins VR512:$t, VR512:$f, i8imm:$cond),
 +                    "#CMOV_V8I64 PSEUDO!",
 +                    [(set VR512:$dst,
 +                      (v8i64 (X86cmov VR512:$t, VR512:$f, imm:$cond,
 +                                          EFLAGS)))]>;
 +  def CMOV_V8F64 : I<0, Pseudo,
 +                    (outs VR512:$dst), (ins VR512:$t, VR512:$f, i8imm:$cond),
 +                    "#CMOV_V8F64 PSEUDO!",
 +                    [(set VR512:$dst,
 +                      (v8f64 (X86cmov VR512:$t, VR512:$f, imm:$cond,
 +                                          EFLAGS)))]>;
 +  def CMOV_V16F32 : I<0, Pseudo,
 +                    (outs VR512:$dst), (ins VR512:$t, VR512:$f, i8imm:$cond),
 +                    "#CMOV_V16F32 PSEUDO!",
 +                    [(set VR512:$dst,
 +                      (v16f32 (X86cmov VR512:$t, VR512:$f, imm:$cond,
 +                                          EFLAGS)))]>;
 +}
 +
 +
 +//===----------------------------------------------------------------------===//
 +// DAG Pattern Matching Rules
 +//===----------------------------------------------------------------------===//
 +
 +// ConstantPool GlobalAddress, ExternalSymbol, and JumpTable
 +def : Pat<(i32 (X86Wrapper tconstpool  :$dst)), (MOV32ri tconstpool  :$dst)>;
 +def : Pat<(i32 (X86Wrapper tjumptable  :$dst)), (MOV32ri tjumptable  :$dst)>;
 +def : Pat<(i32 (X86Wrapper tglobaltlsaddr:$dst)),(MOV32ri tglobaltlsaddr:$dst)>;
 +def : Pat<(i32 (X86Wrapper tglobaladdr :$dst)), (MOV32ri tglobaladdr :$dst)>;
 +def : Pat<(i32 (X86Wrapper texternalsym:$dst)), (MOV32ri texternalsym:$dst)>;
 +def : Pat<(i32 (X86Wrapper tblockaddress:$dst)), (MOV32ri tblockaddress:$dst)>;
 +
 +def : Pat<(add GR32:$src1, (X86Wrapper tconstpool:$src2)),
 +          (ADD32ri GR32:$src1, tconstpool:$src2)>;
 +def : Pat<(add GR32:$src1, (X86Wrapper tjumptable:$src2)),
 +          (ADD32ri GR32:$src1, tjumptable:$src2)>;
 +def : Pat<(add GR32:$src1, (X86Wrapper tglobaladdr :$src2)),
 +          (ADD32ri GR32:$src1, tglobaladdr:$src2)>;
 +def : Pat<(add GR32:$src1, (X86Wrapper texternalsym:$src2)),
 +          (ADD32ri GR32:$src1, texternalsym:$src2)>;
 +def : Pat<(add GR32:$src1, (X86Wrapper tblockaddress:$src2)),
 +          (ADD32ri GR32:$src1, tblockaddress:$src2)>;
 +
 +def : Pat<(store (i32 (X86Wrapper tglobaladdr:$src)), addr:$dst),
 +          (MOV32mi addr:$dst, tglobaladdr:$src)>;
 +def : Pat<(store (i32 (X86Wrapper texternalsym:$src)), addr:$dst),
 +          (MOV32mi addr:$dst, texternalsym:$src)>;
 +def : Pat<(store (i32 (X86Wrapper tblockaddress:$src)), addr:$dst),
 +          (MOV32mi addr:$dst, tblockaddress:$src)>;
 +
 +// ConstantPool GlobalAddress, ExternalSymbol, and JumpTable when not in small
 +// code model mode, should use 'movabs'.  FIXME: This is really a hack, the
 +//  'movabs' predicate should handle this sort of thing.
 +def : Pat<(i64 (X86Wrapper tconstpool  :$dst)),
 +          (MOV64ri tconstpool  :$dst)>, Requires<[FarData]>;
 +def : Pat<(i64 (X86Wrapper tjumptable  :$dst)),
 +          (MOV64ri tjumptable  :$dst)>, Requires<[FarData]>;
 +def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)),
 +          (MOV64ri tglobaladdr :$dst)>, Requires<[FarData]>;
 +def : Pat<(i64 (X86Wrapper texternalsym:$dst)),
 +          (MOV64ri texternalsym:$dst)>, Requires<[FarData]>;
 +def : Pat<(i64 (X86Wrapper tblockaddress:$dst)),
 +          (MOV64ri tblockaddress:$dst)>, Requires<[FarData]>;
 +
 +// In kernel code model, we can get the address of a label
 +// into a register with 'movq'.  FIXME: This is a hack, the 'imm' predicate of
 +// the MOV64ri32 should accept these.
 +def : Pat<(i64 (X86Wrapper tconstpool  :$dst)),
 +          (MOV64ri32 tconstpool  :$dst)>, Requires<[KernelCode]>;
 +def : Pat<(i64 (X86Wrapper tjumptable  :$dst)),
 +          (MOV64ri32 tjumptable  :$dst)>, Requires<[KernelCode]>;
 +def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)),
 +          (MOV64ri32 tglobaladdr :$dst)>, Requires<[KernelCode]>;
 +def : Pat<(i64 (X86Wrapper texternalsym:$dst)),
 +          (MOV64ri32 texternalsym:$dst)>, Requires<[KernelCode]>;
 +def : Pat<(i64 (X86Wrapper tblockaddress:$dst)),
 +          (MOV64ri32 tblockaddress:$dst)>, Requires<[KernelCode]>;
 +
 +// In the small code model with -static, it is safe to store global addresses
 +// directly as immediates.  FIXME: This is really a hack, the 'imm' predicate
 +// for MOV64mi32 should handle this sort of thing.
 +def : Pat<(store (i64 (X86Wrapper tconstpool:$src)), addr:$dst),
 +          (MOV64mi32 addr:$dst, tconstpool:$src)>,
 +          Requires<[NearData, IsStatic]>;
 +def : Pat<(store (i64 (X86Wrapper tjumptable:$src)), addr:$dst),
 +          (MOV64mi32 addr:$dst, tjumptable:$src)>,
 +          Requires<[NearData, IsStatic]>;
 +def : Pat<(store (i64 (X86Wrapper tglobaladdr:$src)), addr:$dst),
 +          (MOV64mi32 addr:$dst, tglobaladdr:$src)>,
 +          Requires<[NearData, IsStatic]>;
 +def : Pat<(store (i64 (X86Wrapper texternalsym:$src)), addr:$dst),
 +          (MOV64mi32 addr:$dst, texternalsym:$src)>,
 +          Requires<[NearData, IsStatic]>;
 +def : Pat<(store (i64 (X86Wrapper tblockaddress:$src)), addr:$dst),
 +          (MOV64mi32 addr:$dst, tblockaddress:$src)>,
 +          Requires<[NearData, IsStatic]>;
 +
 +def : Pat<(i32 (X86RecoverFrameAlloc texternalsym:$dst)), (MOV32ri texternalsym:$dst)>;
 +def : Pat<(i64 (X86RecoverFrameAlloc texternalsym:$dst)), (MOV64ri texternalsym:$dst)>;
 +
 +// Calls
 +
+// TLS has some funny stuff here...
 +// This corresponds to movabs $foo@tpoff, %rax
 +def : Pat<(i64 (X86Wrapper tglobaltlsaddr :$dst)),
 +          (MOV64ri32 tglobaltlsaddr :$dst)>;
 +// This corresponds to add $foo@tpoff, %rax
 +def : Pat<(add GR64:$src1, (X86Wrapper tglobaltlsaddr :$dst)),
 +          (ADD64ri32 GR64:$src1, tglobaltlsaddr :$dst)>;
 +
 +
 +// Direct PC relative function call for small code model. 32-bit displacement
 +// sign extended to 64-bit.
 +def : Pat<(X86call (i64 tglobaladdr:$dst)),
 +          (CALL64pcrel32 tglobaladdr:$dst)>;
 +def : Pat<(X86call (i64 texternalsym:$dst)),
 +          (CALL64pcrel32 texternalsym:$dst)>;
 +
 +// Tailcall stuff. The TCRETURN instructions execute after the epilog, so they
 +// can never use callee-saved registers. That is the purpose of the GR64_TC
 +// register classes.
 +//
 +// The only volatile register that is never used by the calling convention is
 +// %r11. This happens when calling a vararg function with 6 arguments.
 +//
+// Match an X86tcret that uses fewer than 7 volatile registers.
 +def X86tcret_6regs : PatFrag<(ops node:$ptr, node:$off),
 +                             (X86tcret node:$ptr, node:$off), [{
 +  // X86tcret args: (*chain, ptr, imm, regs..., glue)
 +  unsigned NumRegs = 0;
 +  for (unsigned i = 3, e = N->getNumOperands(); i != e; ++i)
 +    if (isa<RegisterSDNode>(N->getOperand(i)) && ++NumRegs > 6)
 +      return false;
 +  return true;
 +}]>;
 +
 +def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off),
 +          (TCRETURNri ptr_rc_tailcall:$dst, imm:$off)>,
 +          Requires<[Not64BitMode]>;
 +
+// FIXME: This is disabled for 32-bit PIC mode because the global base
+// register, which is part of the address mode, may be assigned a
+// callee-saved register.
 +def : Pat<(X86tcret (load addr:$dst), imm:$off),
 +          (TCRETURNmi addr:$dst, imm:$off)>,
 +          Requires<[Not64BitMode, IsNotPIC]>;
 +
 +def : Pat<(X86tcret (i32 tglobaladdr:$dst), imm:$off),
 +          (TCRETURNdi tglobaladdr:$dst, imm:$off)>,
 +          Requires<[NotLP64]>;
 +
 +def : Pat<(X86tcret (i32 texternalsym:$dst), imm:$off),
 +          (TCRETURNdi texternalsym:$dst, imm:$off)>,
 +          Requires<[NotLP64]>;
 +
 +def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off),
 +          (TCRETURNri64 ptr_rc_tailcall:$dst, imm:$off)>,
 +          Requires<[In64BitMode]>;
 +
 +// Don't fold loads into X86tcret requiring more than 6 regs.
 +// There wouldn't be enough scratch registers for base+index.
 +def : Pat<(X86tcret_6regs (load addr:$dst), imm:$off),
 +          (TCRETURNmi64 addr:$dst, imm:$off)>,
 +          Requires<[In64BitMode]>;
 +
 +def : Pat<(X86tcret (i64 tglobaladdr:$dst), imm:$off),
 +          (TCRETURNdi64 tglobaladdr:$dst, imm:$off)>,
 +          Requires<[IsLP64]>;
 +
 +def : Pat<(X86tcret (i64 texternalsym:$dst), imm:$off),
 +          (TCRETURNdi64 texternalsym:$dst, imm:$off)>,
 +          Requires<[IsLP64]>;
 +
 +// Normal calls, with various flavors of addresses.
 +def : Pat<(X86call (i32 tglobaladdr:$dst)),
 +          (CALLpcrel32 tglobaladdr:$dst)>;
 +def : Pat<(X86call (i32 texternalsym:$dst)),
 +          (CALLpcrel32 texternalsym:$dst)>;
 +def : Pat<(X86call (i32 imm:$dst)),
 +          (CALLpcrel32 imm:$dst)>, Requires<[CallImmAddr]>;
 +
 +// Comparisons.
 +
 +// TEST R,R is smaller than CMP R,0
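+// (e.g. "testl %eax, %eax" encodes in 2 bytes, while "cmpl $0, %eax" needs at
+// least 3; the flags used for a compare against zero come out the same.)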
 +def : Pat<(X86cmp GR8:$src1, 0),
 +          (TEST8rr GR8:$src1, GR8:$src1)>;
 +def : Pat<(X86cmp GR16:$src1, 0),
 +          (TEST16rr GR16:$src1, GR16:$src1)>;
 +def : Pat<(X86cmp GR32:$src1, 0),
 +          (TEST32rr GR32:$src1, GR32:$src1)>;
 +def : Pat<(X86cmp GR64:$src1, 0),
 +          (TEST64rr GR64:$src1, GR64:$src1)>;
 +
 +// Conditional moves with folded loads with operands swapped and conditions
 +// inverted.
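+// (The CMOVcc rm forms can only fold the load as the value that is moved when
+// the condition holds, so a load in the other position is matched by swapping
+// the operands and selecting the opposite condition code.)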
 +multiclass CMOVmr<PatLeaf InvertedCond, Instruction Inst16, Instruction Inst32,
 +                  Instruction Inst64> {
 +  let Predicates = [HasCMov] in {
 +    def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, InvertedCond, EFLAGS),
 +              (Inst16 GR16:$src2, addr:$src1)>;
 +    def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, InvertedCond, EFLAGS),
 +              (Inst32 GR32:$src2, addr:$src1)>;
 +    def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, InvertedCond, EFLAGS),
 +              (Inst64 GR64:$src2, addr:$src1)>;
 +  }
 +}
 +
 +defm : CMOVmr<X86_COND_B , CMOVAE16rm, CMOVAE32rm, CMOVAE64rm>;
 +defm : CMOVmr<X86_COND_AE, CMOVB16rm , CMOVB32rm , CMOVB64rm>;
 +defm : CMOVmr<X86_COND_E , CMOVNE16rm, CMOVNE32rm, CMOVNE64rm>;
 +defm : CMOVmr<X86_COND_NE, CMOVE16rm , CMOVE32rm , CMOVE64rm>;
 +defm : CMOVmr<X86_COND_BE, CMOVA16rm , CMOVA32rm , CMOVA64rm>;
 +defm : CMOVmr<X86_COND_A , CMOVBE16rm, CMOVBE32rm, CMOVBE64rm>;
 +defm : CMOVmr<X86_COND_L , CMOVGE16rm, CMOVGE32rm, CMOVGE64rm>;
 +defm : CMOVmr<X86_COND_GE, CMOVL16rm , CMOVL32rm , CMOVL64rm>;
 +defm : CMOVmr<X86_COND_LE, CMOVG16rm , CMOVG32rm , CMOVG64rm>;
 +defm : CMOVmr<X86_COND_G , CMOVLE16rm, CMOVLE32rm, CMOVLE64rm>;
 +defm : CMOVmr<X86_COND_P , CMOVNP16rm, CMOVNP32rm, CMOVNP64rm>;
 +defm : CMOVmr<X86_COND_NP, CMOVP16rm , CMOVP32rm , CMOVP64rm>;
 +defm : CMOVmr<X86_COND_S , CMOVNS16rm, CMOVNS32rm, CMOVNS64rm>;
 +defm : CMOVmr<X86_COND_NS, CMOVS16rm , CMOVS32rm , CMOVS64rm>;
 +defm : CMOVmr<X86_COND_O , CMOVNO16rm, CMOVNO32rm, CMOVNO64rm>;
 +defm : CMOVmr<X86_COND_NO, CMOVO16rm , CMOVO32rm , CMOVO64rm>;
 +
 +// zextload bool -> zextload byte
 +def : Pat<(zextloadi8i1  addr:$src), (MOV8rm     addr:$src)>;
 +def : Pat<(zextloadi16i1 addr:$src), (MOVZX16rm8 addr:$src)>;
 +def : Pat<(zextloadi32i1 addr:$src), (MOVZX32rm8 addr:$src)>;
 +def : Pat<(zextloadi64i1 addr:$src),
 +          (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>;
 +
 +// extload bool -> extload byte
 +// When extloading from 16-bit and smaller memory locations into 64-bit
 +// registers, use zero-extending loads so that the entire 64-bit register is
 +// defined, avoiding partial-register updates.
 +
 +def : Pat<(extloadi8i1 addr:$src),   (MOV8rm      addr:$src)>;
 +def : Pat<(extloadi16i1 addr:$src),  (MOVZX16rm8  addr:$src)>;
 +def : Pat<(extloadi32i1 addr:$src),  (MOVZX32rm8  addr:$src)>;
 +def : Pat<(extloadi16i8 addr:$src),  (MOVZX16rm8  addr:$src)>;
 +def : Pat<(extloadi32i8 addr:$src),  (MOVZX32rm8  addr:$src)>;
 +def : Pat<(extloadi32i16 addr:$src), (MOVZX32rm16 addr:$src)>;
 +
 +// For other extloads, use subregs, since the high contents of the register are
 +// defined after an extload.
 +def : Pat<(extloadi64i1 addr:$src),
 +          (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>;
 +def : Pat<(extloadi64i8 addr:$src),
 +          (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>;
 +def : Pat<(extloadi64i16 addr:$src),
 +          (SUBREG_TO_REG (i64 0), (MOVZX32rm16 addr:$src), sub_32bit)>;
 +def : Pat<(extloadi64i32 addr:$src),
 +          (SUBREG_TO_REG (i64 0), (MOV32rm addr:$src), sub_32bit)>;
 +
 +// anyext. Define these to do an explicit zero-extend to
 +// avoid partial-register updates.
 +def : Pat<(i16 (anyext GR8 :$src)), (EXTRACT_SUBREG
 +                                     (MOVZX32rr8 GR8 :$src), sub_16bit)>;
 +def : Pat<(i32 (anyext GR8 :$src)), (MOVZX32rr8  GR8 :$src)>;
 +
+// Except for i16 -> i32, since isel expects i16 ops to be promoted to i32.
 +def : Pat<(i32 (anyext GR16:$src)),
 +          (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR16:$src, sub_16bit)>;
 +
 +def : Pat<(i64 (anyext GR8 :$src)),
 +          (SUBREG_TO_REG (i64 0), (MOVZX32rr8  GR8  :$src), sub_32bit)>;
 +def : Pat<(i64 (anyext GR16:$src)),
 +          (SUBREG_TO_REG (i64 0), (MOVZX32rr16 GR16 :$src), sub_32bit)>;
 +def : Pat<(i64 (anyext GR32:$src)),
 +          (SUBREG_TO_REG (i64 0), GR32:$src, sub_32bit)>;
 +
 +
+// A 32-bit def only fails to zero the high half of the 64-bit register in a
+// few cases: Truncate can be lowered to EXTRACT_SUBREG, which leaves the high
+// half unchanged; CopyFromReg may be copying from a truncate; and x86's cmov
+// doesn't write anything if the condition is false. Any other 32-bit
+// operation will zero-extend up to 64 bits.
 +def def32 : PatLeaf<(i32 GR32:$src), [{
 +  return N->getOpcode() != ISD::TRUNCATE &&
 +         N->getOpcode() != TargetOpcode::EXTRACT_SUBREG &&
 +         N->getOpcode() != ISD::CopyFromReg &&
 +         N->getOpcode() != ISD::AssertSext &&
 +         N->getOpcode() != X86ISD::CMOV;
 +}]>;
 +
 +// In the case of a 32-bit def that is known to implicitly zero-extend,
 +// we can use a SUBREG_TO_REG.
 +def : Pat<(i64 (zext def32:$src)),
 +          (SUBREG_TO_REG (i64 0), GR32:$src, sub_32bit)>;
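+// (e.g. zero-extending the result of a 32-bit add to i64 needs no extra
+// instruction: the addl has already cleared bits 63:32, so the result only
+// has to be placed in the 64-bit register with SUBREG_TO_REG.)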
 +
 +//===----------------------------------------------------------------------===//
 +// Pattern match OR as ADD
 +//===----------------------------------------------------------------------===//
 +
 +// If safe, we prefer to pattern match OR as ADD at isel time. ADD can be
 +// 3-addressified into an LEA instruction to avoid copies.  However, we also
 +// want to finally emit these instructions as an or at the end of the code
 +// generator to make the generated code easier to read.  To do this, we select
 +// into "disjoint bits" pseudo ops.
 +
+// Treat an 'or' node as an 'add' if the or'ed bits are known to be zero.
 +def or_is_add : PatFrag<(ops node:$lhs, node:$rhs), (or node:$lhs, node:$rhs),[{
 +  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(1)))
 +    return CurDAG->MaskedValueIsZero(N->getOperand(0), CN->getAPIntValue());
 +
 +  APInt KnownZero0, KnownOne0;
 +  CurDAG->computeKnownBits(N->getOperand(0), KnownZero0, KnownOne0, 0);
 +  APInt KnownZero1, KnownOne1;
 +  CurDAG->computeKnownBits(N->getOperand(1), KnownZero1, KnownOne1, 0);
 +  return (~KnownZero0 & ~KnownZero1) == 0;
 +}]>;
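+// (e.g. in (or (shl x, 3), 7) the operands cannot have set bits in common, so
+// it is equivalent to (add (shl x, 3), 7), which can be selected as a single
+// "lea 7(,%reg,8), %dst".)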
 +
 +
+// (or x1, x2) -> (add x1, x2) if the two operands are known not to share bits.
+// Try this before selecting to OR.
 +let AddedComplexity = 5, SchedRW = [WriteALU] in {
 +
 +let isConvertibleToThreeAddress = 1,
 +    Constraints = "$src1 = $dst", Defs = [EFLAGS] in {
 +let isCommutable = 1 in {
 +def ADD16rr_DB  : I<0, Pseudo, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
 +                    "", // orw/addw REG, REG
 +                    [(set GR16:$dst, (or_is_add GR16:$src1, GR16:$src2))]>;
 +def ADD32rr_DB  : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
 +                    "", // orl/addl REG, REG
 +                    [(set GR32:$dst, (or_is_add GR32:$src1, GR32:$src2))]>;
 +def ADD64rr_DB  : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
 +                    "", // orq/addq REG, REG
 +                    [(set GR64:$dst, (or_is_add GR64:$src1, GR64:$src2))]>;
 +} // isCommutable
 +
+// NOTE: These are order-specific; we want the ri8 forms to be listed
+// first so that they are slightly preferred to the ri forms.
 +
 +def ADD16ri8_DB : I<0, Pseudo,
 +                    (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
 +                    "", // orw/addw REG, imm8
 +                    [(set GR16:$dst,(or_is_add GR16:$src1,i16immSExt8:$src2))]>;
 +def ADD16ri_DB  : I<0, Pseudo, (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
 +                    "", // orw/addw REG, imm
 +                    [(set GR16:$dst, (or_is_add GR16:$src1, imm:$src2))]>;
 +
 +def ADD32ri8_DB : I<0, Pseudo,
 +                    (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2),
 +                    "", // orl/addl REG, imm8
 +                    [(set GR32:$dst,(or_is_add GR32:$src1,i32immSExt8:$src2))]>;
 +def ADD32ri_DB  : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2),
 +                    "", // orl/addl REG, imm
 +                    [(set GR32:$dst, (or_is_add GR32:$src1, imm:$src2))]>;
 +
 +
 +def ADD64ri8_DB : I<0, Pseudo,
 +                    (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
 +                    "", // orq/addq REG, imm8
 +                    [(set GR64:$dst, (or_is_add GR64:$src1,
 +                                                i64immSExt8:$src2))]>;
 +def ADD64ri32_DB : I<0, Pseudo,
 +                     (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2),
 +                      "", // orq/addq REG, imm
 +                      [(set GR64:$dst, (or_is_add GR64:$src1,
 +                                                  i64immSExt32:$src2))]>;
 +}
 +} // AddedComplexity, SchedRW
 +
 +
 +//===----------------------------------------------------------------------===//
 +// Some peepholes
 +//===----------------------------------------------------------------------===//
 +
 +// Odd encoding trick: -128 fits into an 8-bit immediate field while
 +// +128 doesn't, so in this special case use a sub instead of an add.
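+// (e.g. "subl $-128, %eax" uses the sign-extended imm8 form and is 3 bytes,
+// while "addl $128, %eax" would need a 4-byte immediate.)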
 +def : Pat<(add GR16:$src1, 128),
 +          (SUB16ri8 GR16:$src1, -128)>;
 +def : Pat<(store (add (loadi16 addr:$dst), 128), addr:$dst),
 +          (SUB16mi8 addr:$dst, -128)>;
 +
 +def : Pat<(add GR32:$src1, 128),
 +          (SUB32ri8 GR32:$src1, -128)>;
 +def : Pat<(store (add (loadi32 addr:$dst), 128), addr:$dst),
 +          (SUB32mi8 addr:$dst, -128)>;
 +
 +def : Pat<(add GR64:$src1, 128),
 +          (SUB64ri8 GR64:$src1, -128)>;
 +def : Pat<(store (add (loadi64 addr:$dst), 128), addr:$dst),
 +          (SUB64mi8 addr:$dst, -128)>;
 +
 +// The same trick applies for 32-bit immediate fields in 64-bit
 +// instructions.
 +def : Pat<(add GR64:$src1, 0x0000000080000000),
 +          (SUB64ri32 GR64:$src1, 0xffffffff80000000)>;
+def : Pat<(store (add (loadi64 addr:$dst), 0x0000000080000000), addr:$dst),
 +          (SUB64mi32 addr:$dst, 0xffffffff80000000)>;
 +
+// To avoid needing to materialize an immediate in a register, use a 32-bit AND
+// with implicit zero-extension instead of a 64-bit AND if the immediate has at
+// least 32 bits of leading zeros. If, in addition, the last 32 bits can be
+// represented with a sign extension of an 8-bit constant, use that.
 +
 +def : Pat<(and GR64:$src, i64immZExt32SExt8:$imm),
 +          (SUBREG_TO_REG
 +            (i64 0),
 +            (AND32ri8
 +              (EXTRACT_SUBREG GR64:$src, sub_32bit),
 +              (i32 (GetLo8XForm imm:$imm))),
 +            sub_32bit)>;
 +
 +def : Pat<(and GR64:$src, i64immZExt32:$imm),
 +          (SUBREG_TO_REG
 +            (i64 0),
 +            (AND32ri
 +              (EXTRACT_SUBREG GR64:$src, sub_32bit),
 +              (i32 (GetLo32XForm imm:$imm))),
 +            sub_32bit)>;
 +
 +
 +// r & (2^16-1) ==> movz
 +def : Pat<(and GR32:$src1, 0xffff),
 +          (MOVZX32rr16 (EXTRACT_SUBREG GR32:$src1, sub_16bit))>;
 +// r & (2^8-1) ==> movz
 +def : Pat<(and GR32:$src1, 0xff),
 +          (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src1,
 +                                                             GR32_ABCD)),
 +                                      sub_8bit))>,
 +      Requires<[Not64BitMode]>;
 +// r & (2^8-1) ==> movz
 +def : Pat<(and GR16:$src1, 0xff),
 +           (EXTRACT_SUBREG (MOVZX32rr8 (EXTRACT_SUBREG
 +            (i16 (COPY_TO_REGCLASS GR16:$src1, GR16_ABCD)), sub_8bit)),
 +             sub_16bit)>,
 +      Requires<[Not64BitMode]>;
 +
 +// r & (2^32-1) ==> movz
 +def : Pat<(and GR64:$src, 0x00000000FFFFFFFF),
 +          (SUBREG_TO_REG (i64 0),
 +                         (MOV32rr (EXTRACT_SUBREG GR64:$src, sub_32bit)),
 +                         sub_32bit)>;
 +// r & (2^16-1) ==> movz
 +def : Pat<(and GR64:$src, 0xffff),
 +          (SUBREG_TO_REG (i64 0),
 +                      (MOVZX32rr16 (i16 (EXTRACT_SUBREG GR64:$src, sub_16bit))),
 +                      sub_32bit)>;
 +// r & (2^8-1) ==> movz
 +def : Pat<(and GR64:$src, 0xff),
 +          (SUBREG_TO_REG (i64 0),
 +                         (MOVZX32rr8 (i8 (EXTRACT_SUBREG GR64:$src, sub_8bit))),
 +                         sub_32bit)>;
 +// r & (2^8-1) ==> movz
 +def : Pat<(and GR32:$src1, 0xff),
 +           (MOVZX32rr8 (EXTRACT_SUBREG GR32:$src1, sub_8bit))>,
 +      Requires<[In64BitMode]>;
 +// r & (2^8-1) ==> movz
 +def : Pat<(and GR16:$src1, 0xff),
 +           (EXTRACT_SUBREG (MOVZX32rr8 (i8
 +            (EXTRACT_SUBREG GR16:$src1, sub_8bit))), sub_16bit)>,
 +      Requires<[In64BitMode]>;
 +
 +
 +// sext_inreg patterns
 +def : Pat<(sext_inreg GR32:$src, i16),
 +          (MOVSX32rr16 (EXTRACT_SUBREG GR32:$src, sub_16bit))>;
 +def : Pat<(sext_inreg GR32:$src, i8),
 +          (MOVSX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src,
 +                                                             GR32_ABCD)),
 +                                      sub_8bit))>,
 +      Requires<[Not64BitMode]>;
 +
 +def : Pat<(sext_inreg GR16:$src, i8),
 +           (EXTRACT_SUBREG (i32 (MOVSX32rr8 (EXTRACT_SUBREG
 +            (i32 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), sub_8bit))),
 +             sub_16bit)>,
 +      Requires<[Not64BitMode]>;
 +
 +def : Pat<(sext_inreg GR64:$src, i32),
 +          (MOVSX64rr32 (EXTRACT_SUBREG GR64:$src, sub_32bit))>;
 +def : Pat<(sext_inreg GR64:$src, i16),
 +          (MOVSX64rr16 (EXTRACT_SUBREG GR64:$src, sub_16bit))>;
 +def : Pat<(sext_inreg GR64:$src, i8),
 +          (MOVSX64rr8 (EXTRACT_SUBREG GR64:$src, sub_8bit))>;
 +def : Pat<(sext_inreg GR32:$src, i8),
 +          (MOVSX32rr8 (EXTRACT_SUBREG GR32:$src, sub_8bit))>,
 +      Requires<[In64BitMode]>;
 +def : Pat<(sext_inreg GR16:$src, i8),
 +           (EXTRACT_SUBREG (MOVSX32rr8
 +            (EXTRACT_SUBREG GR16:$src, sub_8bit)), sub_16bit)>,
 +      Requires<[In64BitMode]>;
 +
 +// sext, sext_load, zext, zext_load
 +def: Pat<(i16 (sext GR8:$src)),
 +          (EXTRACT_SUBREG (MOVSX32rr8 GR8:$src), sub_16bit)>;
 +def: Pat<(sextloadi16i8 addr:$src),
 +          (EXTRACT_SUBREG (MOVSX32rm8 addr:$src), sub_16bit)>;
 +def: Pat<(i16 (zext GR8:$src)),
 +          (EXTRACT_SUBREG (MOVZX32rr8 GR8:$src), sub_16bit)>;
 +def: Pat<(zextloadi16i8 addr:$src),
 +          (EXTRACT_SUBREG (MOVZX32rm8 addr:$src), sub_16bit)>;
 +
 +// trunc patterns
 +def : Pat<(i16 (trunc GR32:$src)),
 +          (EXTRACT_SUBREG GR32:$src, sub_16bit)>;
 +def : Pat<(i8 (trunc GR32:$src)),
 +          (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)),
 +                          sub_8bit)>,
 +      Requires<[Not64BitMode]>;
 +def : Pat<(i8 (trunc GR16:$src)),
 +          (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
 +                          sub_8bit)>,
 +      Requires<[Not64BitMode]>;
 +def : Pat<(i32 (trunc GR64:$src)),
 +          (EXTRACT_SUBREG GR64:$src, sub_32bit)>;
 +def : Pat<(i16 (trunc GR64:$src)),
 +          (EXTRACT_SUBREG GR64:$src, sub_16bit)>;
 +def : Pat<(i8 (trunc GR64:$src)),
 +          (EXTRACT_SUBREG GR64:$src, sub_8bit)>;
 +def : Pat<(i8 (trunc GR32:$src)),
 +          (EXTRACT_SUBREG GR32:$src, sub_8bit)>,
 +      Requires<[In64BitMode]>;
 +def : Pat<(i8 (trunc GR16:$src)),
 +          (EXTRACT_SUBREG GR16:$src, sub_8bit)>,
 +      Requires<[In64BitMode]>;
 +
 +// h-register tricks
 +def : Pat<(i8 (trunc (srl_su GR16:$src, (i8 8)))),
 +          (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
 +                          sub_8bit_hi)>,
 +      Requires<[Not64BitMode]>;
 +def : Pat<(i8 (trunc (srl_su GR32:$src, (i8 8)))),
 +          (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)),
 +                          sub_8bit_hi)>,
 +      Requires<[Not64BitMode]>;
 +def : Pat<(srl GR16:$src, (i8 8)),
 +          (EXTRACT_SUBREG
 +            (MOVZX32rr8
 +              (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
 +                              sub_8bit_hi)),
 +            sub_16bit)>,
 +      Requires<[Not64BitMode]>;
 +def : Pat<(i32 (zext (srl_su GR16:$src, (i8 8)))),
 +          (MOVZX32rr8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src,
 +                                                             GR16_ABCD)),
 +                                      sub_8bit_hi))>,
 +      Requires<[Not64BitMode]>;
 +def : Pat<(i32 (anyext (srl_su GR16:$src, (i8 8)))),
 +          (MOVZX32rr8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src,
 +                                                             GR16_ABCD)),
 +                                      sub_8bit_hi))>,
 +      Requires<[Not64BitMode]>;
 +def : Pat<(and (srl_su GR32:$src, (i8 8)), (i32 255)),
 +          (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src,
 +                                                             GR32_ABCD)),
 +                                      sub_8bit_hi))>,
 +      Requires<[Not64BitMode]>;
 +def : Pat<(srl (and_su GR32:$src, 0xff00), (i8 8)),
 +          (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src,
 +                                                             GR32_ABCD)),
 +                                      sub_8bit_hi))>,
 +      Requires<[Not64BitMode]>;
 +
 +// h-register tricks.
 +// For now, be conservative on x86-64 and use an h-register extract only if the
 +// value is immediately zero-extended or stored, which are somewhat common
 +// cases. This uses a bunch of code to prevent a register requiring a REX prefix
 +// from being allocated in the same instruction as the h register, as there's
 +// currently no way to describe this requirement to the register allocator.
 +
 +// h-register extract and zero-extend.
 +def : Pat<(and (srl_su GR64:$src, (i8 8)), (i64 255)),
 +          (SUBREG_TO_REG
 +            (i64 0),
 +            (MOVZX32_NOREXrr8
 +              (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS GR64:$src, GR64_ABCD)),
 +                              sub_8bit_hi)),
 +            sub_32bit)>;
 +def : Pat<(and (srl_su GR32:$src, (i8 8)), (i32 255)),
 +          (MOVZX32_NOREXrr8
 +            (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)),
 +                            sub_8bit_hi))>,
 +      Requires<[In64BitMode]>;
 +def : Pat<(srl (and_su GR32:$src, 0xff00), (i8 8)),
 +          (MOVZX32_NOREXrr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src,
 +                                                                   GR32_ABCD)),
 +                                             sub_8bit_hi))>,
 +      Requires<[In64BitMode]>;
 +def : Pat<(srl GR16:$src, (i8 8)),
 +          (EXTRACT_SUBREG
 +            (MOVZX32_NOREXrr8
 +              (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
 +                              sub_8bit_hi)),
 +            sub_16bit)>,
 +      Requires<[In64BitMode]>;
 +def : Pat<(i32 (zext (srl_su GR16:$src, (i8 8)))),
 +          (MOVZX32_NOREXrr8
 +            (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
 +                            sub_8bit_hi))>,
 +      Requires<[In64BitMode]>;
 +def : Pat<(i32 (anyext (srl_su GR16:$src, (i8 8)))),
 +          (MOVZX32_NOREXrr8
 +            (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
 +                            sub_8bit_hi))>,
 +      Requires<[In64BitMode]>;
 +def : Pat<(i64 (zext (srl_su GR16:$src, (i8 8)))),
 +          (SUBREG_TO_REG
 +            (i64 0),
 +            (MOVZX32_NOREXrr8
 +              (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
 +                              sub_8bit_hi)),
 +            sub_32bit)>;
 +def : Pat<(i64 (anyext (srl_su GR16:$src, (i8 8)))),
 +          (SUBREG_TO_REG
 +            (i64 0),
 +            (MOVZX32_NOREXrr8
 +              (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
 +                              sub_8bit_hi)),
 +            sub_32bit)>;
 +
 +// h-register extract and store.
 +def : Pat<(store (i8 (trunc_su (srl_su GR64:$src, (i8 8)))), addr:$dst),
 +          (MOV8mr_NOREX
 +            addr:$dst,
 +            (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS GR64:$src, GR64_ABCD)),
 +                            sub_8bit_hi))>;
 +def : Pat<(store (i8 (trunc_su (srl_su GR32:$src, (i8 8)))), addr:$dst),
 +          (MOV8mr_NOREX
 +            addr:$dst,
 +            (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)),
 +                            sub_8bit_hi))>,
 +      Requires<[In64BitMode]>;
 +def : Pat<(store (i8 (trunc_su (srl_su GR16:$src, (i8 8)))), addr:$dst),
 +          (MOV8mr_NOREX
 +            addr:$dst,
 +            (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
 +                            sub_8bit_hi))>,
 +      Requires<[In64BitMode]>;
 +
 +
 +// (shl x, 1) ==> (add x, x)
 +// Note that if x is undef (immediate or otherwise), we could theoretically
 +// end up with the two uses of x getting different values, producing a result
 +// where the least significant bit is not 0. However, the probability of this
 +// happening is considered low enough that this is officially not a
 +// "real problem".
 +def : Pat<(shl GR8 :$src1, (i8 1)), (ADD8rr  GR8 :$src1, GR8 :$src1)>;
 +def : Pat<(shl GR16:$src1, (i8 1)), (ADD16rr GR16:$src1, GR16:$src1)>;
 +def : Pat<(shl GR32:$src1, (i8 1)), (ADD32rr GR32:$src1, GR32:$src1)>;
 +def : Pat<(shl GR64:$src1, (i8 1)), (ADD64rr GR64:$src1, GR64:$src1)>;
 +
+// Helper immediate predicates that check whether a mask doesn't change the
+// significant shift bits.
 +def immShift32 : ImmLeaf<i8, [{ return CountTrailingOnes_32(Imm) >= 5; }]>;
 +def immShift64 : ImmLeaf<i8, [{ return CountTrailingOnes_32(Imm) >= 6; }]>;
 +
 +// Shift amount is implicitly masked.
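+// (This is safe because the hardware already masks the count in CL: shifts
+// and rotates use the low 5 bits for 8/16/32-bit operands and the low 6 bits
+// for 64-bit operands, so the explicit AND is redundant.)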
 +multiclass MaskedShiftAmountPats<SDNode frag, string name> {
 +  // (shift x (and y, 31)) ==> (shift x, y)
 +  def : Pat<(frag GR8:$src1, (and CL, immShift32)),
 +            (!cast<Instruction>(name # "8rCL") GR8:$src1)>;
 +  def : Pat<(frag GR16:$src1, (and CL, immShift32)),
 +            (!cast<Instruction>(name # "16rCL") GR16:$src1)>;
 +  def : Pat<(frag GR32:$src1, (and CL, immShift32)),
 +            (!cast<Instruction>(name # "32rCL") GR32:$src1)>;
 +  def : Pat<(store (frag (loadi8 addr:$dst), (and CL, immShift32)), addr:$dst),
 +            (!cast<Instruction>(name # "8mCL") addr:$dst)>;
 +  def : Pat<(store (frag (loadi16 addr:$dst), (and CL, immShift32)), addr:$dst),
 +            (!cast<Instruction>(name # "16mCL") addr:$dst)>;
 +  def : Pat<(store (frag (loadi32 addr:$dst), (and CL, immShift32)), addr:$dst),
 +            (!cast<Instruction>(name # "32mCL") addr:$dst)>;
 +
 +  // (shift x (and y, 63)) ==> (shift x, y)
 +  def : Pat<(frag GR64:$src1, (and CL, immShift64)),
 +            (!cast<Instruction>(name # "64rCL") GR64:$src1)>;
 +  def : Pat<(store (frag (loadi64 addr:$dst), (and CL, 63)), addr:$dst),
 +            (!cast<Instruction>(name # "64mCL") addr:$dst)>;
 +}
 +
 +defm : MaskedShiftAmountPats<shl, "SHL">;
 +defm : MaskedShiftAmountPats<srl, "SHR">;
 +defm : MaskedShiftAmountPats<sra, "SAR">;
 +defm : MaskedShiftAmountPats<rotl, "ROL">;
 +defm : MaskedShiftAmountPats<rotr, "ROR">;
 +
 +// (anyext (setcc_carry)) -> (setcc_carry)
 +def : Pat<(i16 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
 +          (SETB_C16r)>;
 +def : Pat<(i32 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
 +          (SETB_C32r)>;
 +def : Pat<(i32 (anyext (i16 (X86setcc_c X86_COND_B, EFLAGS)))),
 +          (SETB_C32r)>;
 +
 +
 +
 +
 +//===----------------------------------------------------------------------===//
 +// EFLAGS-defining Patterns
 +//===----------------------------------------------------------------------===//
 +
 +// add reg, reg
 +def : Pat<(add GR8 :$src1, GR8 :$src2), (ADD8rr  GR8 :$src1, GR8 :$src2)>;
 +def : Pat<(add GR16:$src1, GR16:$src2), (ADD16rr GR16:$src1, GR16:$src2)>;
 +def : Pat<(add GR32:$src1, GR32:$src2), (ADD32rr GR32:$src1, GR32:$src2)>;
 +
 +// add reg, mem
 +def : Pat<(add GR8:$src1, (loadi8 addr:$src2)),
 +          (ADD8rm GR8:$src1, addr:$src2)>;
 +def : Pat<(add GR16:$src1, (loadi16 addr:$src2)),
 +          (ADD16rm GR16:$src1, addr:$src2)>;
 +def : Pat<(add GR32:$src1, (loadi32 addr:$src2)),
 +          (ADD32rm GR32:$src1, addr:$src2)>;
 +
 +// add reg, imm
 +def : Pat<(add GR8 :$src1, imm:$src2), (ADD8ri  GR8:$src1 , imm:$src2)>;
 +def : Pat<(add GR16:$src1, imm:$src2), (ADD16ri GR16:$src1, imm:$src2)>;
 +def : Pat<(add GR32:$src1, imm:$src2), (ADD32ri GR32:$src1, imm:$src2)>;
 +def : Pat<(add GR16:$src1, i16immSExt8:$src2),
 +          (ADD16ri8 GR16:$src1, i16immSExt8:$src2)>;
 +def : Pat<(add GR32:$src1, i32immSExt8:$src2),
 +          (ADD32ri8 GR32:$src1, i32immSExt8:$src2)>;
 +
 +// sub reg, reg
 +def : Pat<(sub GR8 :$src1, GR8 :$src2), (SUB8rr  GR8 :$src1, GR8 :$src2)>;
 +def : Pat<(sub GR16:$src1, GR16:$src2), (SUB16rr GR16:$src1, GR16:$src2)>;
 +def : Pat<(sub GR32:$src1, GR32:$src2), (SUB32rr GR32:$src1, GR32:$src2)>;
 +
 +// sub reg, mem
 +def : Pat<(sub GR8:$src1, (loadi8 addr:$src2)),
 +          (SUB8rm GR8:$src1, addr:$src2)>;
 +def : Pat<(sub GR16:$src1, (loadi16 addr:$src2)),
 +          (SUB16rm GR16:$src1, addr:$src2)>;
 +def : Pat<(sub GR32:$src1, (loadi32 addr:$src2)),
 +          (SUB32rm GR32:$src1, addr:$src2)>;
 +
 +// sub reg, imm
 +def : Pat<(sub GR8:$src1, imm:$src2),
 +          (SUB8ri GR8:$src1, imm:$src2)>;
 +def : Pat<(sub GR16:$src1, imm:$src2),
 +          (SUB16ri GR16:$src1, imm:$src2)>;
 +def : Pat<(sub GR32:$src1, imm:$src2),
 +          (SUB32ri GR32:$src1, imm:$src2)>;
 +def : Pat<(sub GR16:$src1, i16immSExt8:$src2),
 +          (SUB16ri8 GR16:$src1, i16immSExt8:$src2)>;
 +def : Pat<(sub GR32:$src1, i32immSExt8:$src2),
 +          (SUB32ri8 GR32:$src1, i32immSExt8:$src2)>;
 +
 +// sub 0, reg
 +def : Pat<(X86sub_flag 0, GR8 :$src), (NEG8r  GR8 :$src)>;
 +def : Pat<(X86sub_flag 0, GR16:$src), (NEG16r GR16:$src)>;
 +def : Pat<(X86sub_flag 0, GR32:$src), (NEG32r GR32:$src)>;
 +def : Pat<(X86sub_flag 0, GR64:$src), (NEG64r GR64:$src)>;
 +
 +// mul reg, reg
 +def : Pat<(mul GR16:$src1, GR16:$src2),
 +          (IMUL16rr GR16:$src1, GR16:$src2)>;
 +def : Pat<(mul GR32:$src1, GR32:$src2),
 +          (IMUL32rr GR32:$src1, GR32:$src2)>;
 +
 +// mul reg, mem
 +def : Pat<(mul GR16:$src1, (loadi16 addr:$src2)),
 +          (IMUL16rm GR16:$src1, addr:$src2)>;
 +def : Pat<(mul GR32:$src1, (loadi32 addr:$src2)),
 +          (IMUL32rm GR32:$src1, addr:$src2)>;
 +
 +// mul reg, imm
 +def : Pat<(mul GR16:$src1, imm:$src2),
 +          (IMUL16rri GR16:$src1, imm:$src2)>;
 +def : Pat<(mul GR32:$src1, imm:$src2),
 +          (IMUL32rri GR32:$src1, imm:$src2)>;
 +def : Pat<(mul GR16:$src1, i16immSExt8:$src2),
 +          (IMUL16rri8 GR16:$src1, i16immSExt8:$src2)>;
 +def : Pat<(mul GR32:$src1, i32immSExt8:$src2),
 +          (IMUL32rri8 GR32:$src1, i32immSExt8:$src2)>;
 +
 +// reg = mul mem, imm
 +def : Pat<(mul (loadi16 addr:$src1), imm:$src2),
 +          (IMUL16rmi addr:$src1, imm:$src2)>;
 +def : Pat<(mul (loadi32 addr:$src1), imm:$src2),
 +          (IMUL32rmi addr:$src1, imm:$src2)>;
 +def : Pat<(mul (loadi16 addr:$src1), i16immSExt8:$src2),
 +          (IMUL16rmi8 addr:$src1, i16immSExt8:$src2)>;
 +def : Pat<(mul (loadi32 addr:$src1), i32immSExt8:$src2),
 +          (IMUL32rmi8 addr:$src1, i32immSExt8:$src2)>;
 +
 +// Patterns for nodes that do not produce flags, for instructions that do.
 +
 +// addition
 +def : Pat<(add GR64:$src1, GR64:$src2),
 +          (ADD64rr GR64:$src1, GR64:$src2)>;
 +def : Pat<(add GR64:$src1, i64immSExt8:$src2),
 +          (ADD64ri8 GR64:$src1, i64immSExt8:$src2)>;
 +def : Pat<(add GR64:$src1, i64immSExt32:$src2),
 +          (ADD64ri32 GR64:$src1, i64immSExt32:$src2)>;
 +def : Pat<(add GR64:$src1, (loadi64 addr:$src2)),
 +          (ADD64rm GR64:$src1, addr:$src2)>;
 +
 +// subtraction
 +def : Pat<(sub GR64:$src1, GR64:$src2),
 +          (SUB64rr GR64:$src1, GR64:$src2)>;
 +def : Pat<(sub GR64:$src1, (loadi64 addr:$src2)),
 +          (SUB64rm GR64:$src1, addr:$src2)>;
 +def : Pat<(sub GR64:$src1, i64immSExt8:$src2),
 +          (SUB64ri8 GR64:$src1, i64immSExt8:$src2)>;
 +def : Pat<(sub GR64:$src1, i64immSExt32:$src2),
 +          (SUB64ri32 GR64:$src1, i64immSExt32:$src2)>;
 +
 +// Multiply
 +def : Pat<(mul GR64:$src1, GR64:$src2),
 +          (IMUL64rr GR64:$src1, GR64:$src2)>;
 +def : Pat<(mul GR64:$src1, (loadi64 addr:$src2)),
 +          (IMUL64rm GR64:$src1, addr:$src2)>;
 +def : Pat<(mul GR64:$src1, i64immSExt8:$src2),
 +          (IMUL64rri8 GR64:$src1, i64immSExt8:$src2)>;
 +def : Pat<(mul GR64:$src1, i64immSExt32:$src2),
 +          (IMUL64rri32 GR64:$src1, i64immSExt32:$src2)>;
 +def : Pat<(mul (loadi64 addr:$src1), i64immSExt8:$src2),
 +          (IMUL64rmi8 addr:$src1, i64immSExt8:$src2)>;
 +def : Pat<(mul (loadi64 addr:$src1), i64immSExt32:$src2),
 +          (IMUL64rmi32 addr:$src1, i64immSExt32:$src2)>;
 +
 +// Increment/Decrement reg.
+// Do not select INC/DEC if it is slow.
 +let Predicates = [NotSlowIncDec] in {
 +  def : Pat<(add GR8:$src, 1),   (INC8r GR8:$src)>;
 +  def : Pat<(add GR16:$src, 1),  (INC16r GR16:$src)>;
 +  def : Pat<(add GR32:$src, 1),  (INC32r GR32:$src)>;
 +  def : Pat<(add GR64:$src, 1),  (INC64r GR64:$src)>;
 +  def : Pat<(add GR8:$src, -1),  (DEC8r GR8:$src)>;
 +  def : Pat<(add GR16:$src, -1), (DEC16r GR16:$src)>;
 +  def : Pat<(add GR32:$src, -1), (DEC32r GR32:$src)>;
 +  def : Pat<(add GR64:$src, -1), (DEC64r GR64:$src)>;
 +}
 +
 +// or reg/reg.
 +def : Pat<(or GR8 :$src1, GR8 :$src2), (OR8rr  GR8 :$src1, GR8 :$src2)>;
 +def : Pat<(or GR16:$src1, GR16:$src2), (OR16rr GR16:$src1, GR16:$src2)>;
 +def : Pat<(or GR32:$src1, GR32:$src2), (OR32rr GR32:$src1, GR32:$src2)>;
 +def : Pat<(or GR64:$src1, GR64:$src2), (OR64rr GR64:$src1, GR64:$src2)>;
 +
 +// or reg/mem
 +def : Pat<(or GR8:$src1, (loadi8 addr:$src2)),
 +          (OR8rm GR8:$src1, addr:$src2)>;
 +def : Pat<(or GR16:$src1, (loadi16 addr:$src2)),
 +          (OR16rm GR16:$src1, addr:$src2)>;
 +def : Pat<(or GR32:$src1, (loadi32 addr:$src2)),
 +          (OR32rm GR32:$src1, addr:$src2)>;
 +def : Pat<(or GR64:$src1, (loadi64 addr:$src2)),
 +          (OR64rm GR64:$src1, addr:$src2)>;
 +
 +// or reg/imm
 +def : Pat<(or GR8:$src1 , imm:$src2), (OR8ri  GR8 :$src1, imm:$src2)>;
 +def : Pat<(or GR16:$src1, imm:$src2), (OR16ri GR16:$src1, imm:$src2)>;
 +def : Pat<(or GR32:$src1, imm:$src2), (OR32ri GR32:$src1, imm:$src2)>;
 +def : Pat<(or GR16:$src1, i16immSExt8:$src2),
 +          (OR16ri8 GR16:$src1, i16immSExt8:$src2)>;
 +def : Pat<(or GR32:$src1, i32immSExt8:$src2),
 +          (OR32ri8 GR32:$src1, i32immSExt8:$src2)>;
 +def : Pat<(or GR64:$src1, i64immSExt8:$src2),
 +          (OR64ri8 GR64:$src1, i64immSExt8:$src2)>;
 +def : Pat<(or GR64:$src1, i64immSExt32:$src2),
 +          (OR64ri32 GR64:$src1, i64immSExt32:$src2)>;
 +
 +// xor reg/reg
 +def : Pat<(xor GR8 :$src1, GR8 :$src2), (XOR8rr  GR8 :$src1, GR8 :$src2)>;
 +def : Pat<(xor GR16:$src1, GR16:$src2), (XOR16rr GR16:$src1, GR16:$src2)>;
 +def : Pat<(xor GR32:$src1, GR32:$src2), (XOR32rr GR32:$src1, GR32:$src2)>;
 +def : Pat<(xor GR64:$src1, GR64:$src2), (XOR64rr GR64:$src1, GR64:$src2)>;
 +
 +// xor reg/mem
 +def : Pat<(xor GR8:$src1, (loadi8 addr:$src2)),
 +          (XOR8rm GR8:$src1, addr:$src2)>;
 +def : Pat<(xor GR16:$src1, (loadi16 addr:$src2)),
 +          (XOR16rm GR16:$src1, addr:$src2)>;
 +def : Pat<(xor GR32:$src1, (loadi32 addr:$src2)),
 +          (XOR32rm GR32:$src1, addr:$src2)>;
 +def : Pat<(xor GR64:$src1, (loadi64 addr:$src2)),
 +          (XOR64rm GR64:$src1, addr:$src2)>;
 +
 +// xor reg/imm
 +def : Pat<(xor GR8:$src1, imm:$src2),
 +          (XOR8ri GR8:$src1, imm:$src2)>;
 +def : Pat<(xor GR16:$src1, imm:$src2),
 +          (XOR16ri GR16:$src1, imm:$src2)>;
 +def : Pat<(xor GR32:$src1, imm:$src2),
 +          (XOR32ri GR32:$src1, imm:$src2)>;
 +def : Pat<(xor GR16:$src1, i16immSExt8:$src2),
 +          (XOR16ri8 GR16:$src1, i16immSExt8:$src2)>;
 +def : Pat<(xor GR32:$src1, i32immSExt8:$src2),
 +          (XOR32ri8 GR32:$src1, i32immSExt8:$src2)>;
 +def : Pat<(xor GR64:$src1, i64immSExt8:$src2),
 +          (XOR64ri8 GR64:$src1, i64immSExt8:$src2)>;
 +def : Pat<(xor GR64:$src1, i64immSExt32:$src2),
 +          (XOR64ri32 GR64:$src1, i64immSExt32:$src2)>;
 +
 +// and reg/reg
 +def : Pat<(and GR8 :$src1, GR8 :$src2), (AND8rr  GR8 :$src1, GR8 :$src2)>;
 +def : Pat<(and GR16:$src1, GR16:$src2), (AND16rr GR16:$src1, GR16:$src2)>;
 +def : Pat<(and GR32:$src1, GR32:$src2), (AND32rr GR32:$src1, GR32:$src2)>;
 +def : Pat<(and GR64:$src1, GR64:$src2), (AND64rr GR64:$src1, GR64:$src2)>;
 +
 +// and reg/mem
 +def : Pat<(and GR8:$src1, (loadi8 addr:$src2)),
 +          (AND8rm GR8:$src1, addr:$src2)>;
 +def : Pat<(and GR16:$src1, (loadi16 addr:$src2)),
 +          (AND16rm GR16:$src1, addr:$src2)>;
 +def : Pat<(and GR32:$src1, (loadi32 addr:$src2)),
 +          (AND32rm GR32:$src1, addr:$src2)>;
 +def : Pat<(and GR64:$src1, (loadi64 addr:$src2)),
 +          (AND64rm GR64:$src1, addr:$src2)>;
 +
 +// and reg/imm
 +def : Pat<(and GR8:$src1, imm:$src2),
 +          (AND8ri GR8:$src1, imm:$src2)>;
 +def : Pat<(and GR16:$src1, imm:$src2),
 +          (AND16ri GR16:$src1, imm:$src2)>;
 +def : Pat<(and GR32:$src1, imm:$src2),
 +          (AND32ri GR32:$src1, imm:$src2)>;
 +def : Pat<(and GR16:$src1, i16immSExt8:$src2),
 +          (AND16ri8 GR16:$src1, i16immSExt8:$src2)>;
 +def : Pat<(and GR32:$src1, i32immSExt8:$src2),
 +          (AND32ri8 GR32:$src1, i32immSExt8:$src2)>;
 +def : Pat<(and GR64:$src1, i64immSExt8:$src2),
 +          (AND64ri8 GR64:$src1, i64immSExt8:$src2)>;
 +def : Pat<(and GR64:$src1, i64immSExt32:$src2),
 +          (AND64ri32 GR64:$src1, i64immSExt32:$src2)>;
 +
 +// Bit scan instruction patterns to match explicit zero-undef behavior.
 +def : Pat<(cttz_zero_undef GR16:$src), (BSF16rr GR16:$src)>;
 +def : Pat<(cttz_zero_undef GR32:$src), (BSF32rr GR32:$src)>;
 +def : Pat<(cttz_zero_undef GR64:$src), (BSF64rr GR64:$src)>;
 +def : Pat<(cttz_zero_undef (loadi16 addr:$src)), (BSF16rm addr:$src)>;
 +def : Pat<(cttz_zero_undef (loadi32 addr:$src)), (BSF32rm addr:$src)>;
 +def : Pat<(cttz_zero_undef (loadi64 addr:$src)), (BSF64rm addr:$src)>;
 +
+// When HasMOVBE is enabled, it is possible to get a non-legalized
+// register-register 16-bit bswap. This maps it to a ROL instruction.
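+// (Swapping the two bytes of a 16-bit value is the same as rotating it by 8,
+// e.g. bswap of %ax becomes "rolw $8, %ax".)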
 +let Predicates = [HasMOVBE] in {
 + def : Pat<(bswap GR16:$src), (ROL16ri GR16:$src, (i8 8))>;
 +}
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 765417f64a8..c8d5c591ba9 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -1804,6 +1804,58 @@ X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
   return false;
 }
 
+int X86InstrInfo::getSPAdjust(const MachineInstr *MI) const {
+  const MachineFunction *MF = MI->getParent()->getParent();
+  const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
+
+  if (MI->getOpcode() == getCallFrameSetupOpcode() ||
+      MI->getOpcode() == getCallFrameDestroyOpcode()) {
+    unsigned StackAlign = TFI->getStackAlignment();
+    int SPAdj = (MI->getOperand(0).getImm() + StackAlign - 1) / StackAlign *
+                 StackAlign;
+
+    SPAdj -= MI->getOperand(1).getImm();
+
+    if (MI->getOpcode() == getCallFrameSetupOpcode())
+      return SPAdj;
+    else
+      return -SPAdj;
+  }
+
+  // To know whether a call adjusts the stack, we need information
+  // that is bound to the following ADJCALLSTACKUP pseudo.
+  // Look for the next ADJCALLSTACKUP that follows the call.
+  if (MI->isCall()) {
+    const MachineBasicBlock* MBB = MI->getParent();
+    auto I = ++MachineBasicBlock::const_iterator(MI);
+    for (auto E = MBB->end(); I != E; ++I) {
+      if (I->getOpcode() == getCallFrameDestroyOpcode() ||
+          I->isCall())
+        break;
+    }
+
+    // If we could not find a frame destroy opcode, then it has already
+    // been simplified, so we don't care.
+    if (I->getOpcode() != getCallFrameDestroyOpcode())
+      return 0;
+
+    return -(I->getOperand(1).getImm());
+  }
+
+  // Currently handle only PUSHes we can reasonably expect to see
+  // in call sequences
+  switch (MI->getOpcode()) {
+  default:
+    return 0;
+  case X86::PUSH32i8:
+  case X86::PUSH32r:
+  case X86::PUSH32rmm:
+  case X86::PUSH32rmr:
+  case X86::PUSHi32:
+    return 4;
+  }
+}
+
 /// isFrameOperand - Return true and the FrameIndex if the specified
 /// operand and follow operands form a reference to the stack frame.
 bool X86InstrInfo::isFrameOperand(const MachineInstr *MI, unsigned int Op,
diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h
index 5662e86932c..4d15467f0ca 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/llvm/lib/Target/X86/X86InstrInfo.h
@@ -175,6 +175,11 @@ public:
   ///
   const X86RegisterInfo &getRegisterInfo() const { return RI; }
 
+  /// getSPAdjust - This returns the stack pointer adjustment made by
+  /// this instruction. For x86, we need to handle more complex call
+  /// sequences involving PUSHes.
+  int getSPAdjust(const MachineInstr *MI) const override;
+
   /// isCoalescableExtInstr - Return true if the instruction is a "coalescable"
   /// extension instruction. That is, it's like a copy where it's legal for the
   /// source to overlap the destination. e.g. X86::MOVSX64rr32. If this returns
diff --git a/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/llvm/lib/Target/X86/X86MachineFunctionInfo.h
index b23a744da68..9fd03a7059c 100644
--- a/llvm/lib/Target/X86/X86MachineFunctionInfo.h
+++ b/llvm/lib/Target/X86/X86MachineFunctionInfo.h
@@ -77,6 +77,9 @@ class X86MachineFunctionInfo : public MachineFunctionInfo {
   unsigned ArgumentStackSize;
   /// NumLocalDynamics - Number of local-dynamic TLS accesses.
   unsigned NumLocalDynamics;
+  /// HasPushSequences - Keeps track of whether this function uses sequences
+  /// of pushes to pass function parameters.
+  bool HasPushSequences;
 
 private:
   /// ForwardedMustTailRegParms - A list of virtual and physical registers
@@ -97,7 +100,8 @@ public:
                              VarArgsGPOffset(0),
                              VarArgsFPOffset(0),
                              ArgumentStackSize(0),
-                             NumLocalDynamics(0) {}
+                             NumLocalDynamics(0),
+                             HasPushSequences(false) {}
 
   explicit X86MachineFunctionInfo(MachineFunction &MF)
     : ForceFramePointer(false),
@@ -113,11 +117,15 @@ public:
       VarArgsGPOffset(0),
       VarArgsFPOffset(0),
       ArgumentStackSize(0),
-      NumLocalDynamics(0) {}
+      NumLocalDynamics(0),
+      HasPushSequences(false) {}
 
   bool getForceFramePointer() const { return ForceFramePointer;}
   void setForceFramePointer(bool forceFP) { ForceFramePointer = forceFP; }
 
+  bool getHasPushSequences() const { return HasPushSequences; }
+  void setHasPushSequences(bool HasPush) { HasPushSequences = HasPush; }
+
   bool getRestoreBasePointer() const { return RestoreBasePointerOffset!=0; }
   void setRestoreBasePointer(const MachineFunction *MF);
   int getRestoreBasePointerOffset() const {return RestoreBasePointerOffset; }
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp
index 09e651cebfb..0fa38f45370 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -468,8 +468,6 @@ void
 X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
                                      int SPAdj, unsigned FIOperandNum,
                                      RegScavenger *RS) const {
-  assert(SPAdj == 0 && "Unexpected");
-
   MachineInstr &MI = *II;
   MachineFunction &MF = *MI.getParent()->getParent();
   const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
@@ -506,6 +504,9 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   } else
     FIOffset = TFI->getFrameIndexOffset(MF, FrameIndex);
 
+  if (BasePtr == StackPtr)
+    FIOffset += SPAdj;
+
   // The frame index format for stackmaps and patchpoints is different from the
   // X86 format. It only has a FI and an offset.
   if (Opc == TargetOpcode::STACKMAP || Opc == TargetOpcode::PATCHPOINT) {
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index ea37b29cc5e..2376123de7c 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -192,6 +192,7 @@ public:
   void addIRPasses() override;
   bool addInstSelector() override;
   bool addILPOpts() override;
+  void addPreRegAlloc() override;
   void addPostRegAlloc() override;
   void addPreEmitPass() override;
 };
@@ -225,6 +226,10 @@ bool X86PassConfig::addILPOpts() {
   return true;
 }
 
+void X86PassConfig::addPreRegAlloc() {
+  addPass(createX86CallFrameOptimization());
+}
+
 void X86PassConfig::addPostRegAlloc() {
   addPass(createX86FloatingPointStackifierPass());
 }

