author | Michael Kuperstein <michael.m.kuperstein@intel.com> | 2015-02-01 16:15:07 +0000
committer | Michael Kuperstein <michael.m.kuperstein@intel.com> | 2015-02-01 16:15:07 +0000
commit | e86aa9a8a455f90bf3d6dc567b9e39b2e4510246 (patch)
tree | e727b02e6da5596b9d3cce6b6f77155337ae2aa3 /llvm
parent | e6698d5305a29a186e035ab737931a139d70bcd2 (diff)
download | bcm5719-llvm-e86aa9a8a455f90bf3d6dc567b9e39b2e4510246.tar.gz bcm5719-llvm-e86aa9a8a455f90bf3d6dc567b9e39b2e4510246.zip
Revert r227728 due to bad line endings.
llvm-svn: 227746
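For context, the reverted commit (r227728) introduced the X86CallFrameOptimization pass, which shrinks 32-bit call sequences by replacing esp-relative movs of outgoing arguments with pushes; its full source appears as deleted lines below. A minimal illustrative sketch of the transformation, with a placeholder callee and register values (not taken from this commit):

    # Before: reserved call frame filled with esp-relative movs
    subl  $8, %esp
    movl  %eax, (%esp)
    movl  $42, 4(%esp)
    calll _foo
    addl  $8, %esp

    # After: the same arguments materialized with pushes
    # (smaller encodings; memory operands can be pushed directly)
    pushl $42
    pushl %eax
    calll _foo
    addl  $8, %esp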
Diffstat (limited to 'llvm')
-rw-r--r-- | llvm/include/llvm/Target/TargetFrameLowering.h | 5
-rw-r--r-- | llvm/lib/CodeGen/PrologEpilogInserter.cpp | 20
-rw-r--r-- | llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp | 5
-rw-r--r-- | llvm/lib/Target/X86/CMakeLists.txt | 107
-rw-r--r-- | llvm/lib/Target/X86/X86.h | 5
-rw-r--r-- | llvm/lib/Target/X86/X86CallFrameOptimization.cpp | 400
-rw-r--r-- | llvm/lib/Target/X86/X86FastISel.cpp | 6716
-rw-r--r-- | llvm/lib/Target/X86/X86FrameLowering.cpp | 4123
-rw-r--r-- | llvm/lib/Target/X86/X86FrameLowering.h | 192
-rw-r--r-- | llvm/lib/Target/X86/X86InstrCompiler.td | 3700
-rw-r--r-- | llvm/lib/Target/X86/X86InstrInfo.cpp | 52
-rw-r--r-- | llvm/lib/Target/X86/X86InstrInfo.h | 5
-rw-r--r-- | llvm/lib/Target/X86/X86MachineFunctionInfo.h | 12
-rw-r--r-- | llvm/lib/Target/X86/X86RegisterInfo.cpp | 5
-rw-r--r-- | llvm/lib/Target/X86/X86TargetMachine.cpp | 5
-rw-r--r-- | llvm/test/CodeGen/X86/inalloca-invoke.ll | 2
-rw-r--r-- | llvm/test/CodeGen/X86/movtopush.ll | 178
17 files changed, 7486 insertions, 8046 deletions
diff --git a/llvm/include/llvm/Target/TargetFrameLowering.h b/llvm/include/llvm/Target/TargetFrameLowering.h
index f17640f71e9..277bd98d371 100644
--- a/llvm/include/llvm/Target/TargetFrameLowering.h
+++ b/llvm/include/llvm/Target/TargetFrameLowering.h
@@ -193,11 +193,6 @@ public:
     return hasReservedCallFrame(MF) || hasFP(MF);
   }
 
-  // needsFrameIndexResolution - Do we need to perform FI resolution for
-  // this function. Normally, this is required only when the function
-  // has any stack objects. However, targets may want to override this.
-  virtual bool needsFrameIndexResolution(const MachineFunction &MF) const;
-
   /// getFrameIndexOffset - Returns the displacement from the frame register to
   /// the stack frame of the specified index.
   virtual int getFrameIndexOffset(const MachineFunction &MF, int FI) const;
diff --git a/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/llvm/lib/CodeGen/PrologEpilogInserter.cpp
index 61407faaf32..385e5a35afb 100644
--- a/llvm/lib/CodeGen/PrologEpilogInserter.cpp
+++ b/llvm/lib/CodeGen/PrologEpilogInserter.cpp
@@ -703,8 +703,7 @@ void PEI::insertPrologEpilogCode(MachineFunction &Fn) {
 /// register references and actual offsets.
 ///
 void PEI::replaceFrameIndices(MachineFunction &Fn) {
-  const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering();
-  if (!TFI.needsFrameIndexResolution(Fn)) return;
+  if (!Fn.getFrameInfo()->hasStackObjects()) return; // Nothing to do?
 
   // Store SPAdj at exit of a basic block.
   SmallVector<int, 8> SPState;
@@ -770,6 +769,13 @@ void PEI::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &Fn,
       continue;
     }
 
+    // If we are looking at a call sequence, we need to keep track of
+    // the SP adjustment made by each instruction in the sequence.
+    // This includes both the frame setup/destroy pseudos (handled above),
+    // as well as other instructions that have side effects w.r.t the SP.
+    if (InsideCallSequence)
+      SPAdj += TII.getSPAdjust(I);
+
     MachineInstr *MI = I;
     bool DoIncr = true;
     for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
@@ -848,16 +854,6 @@ void PEI::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &Fn,
       break;
     }
 
-    // If we are looking at a call sequence, we need to keep track of
-    // the SP adjustment made by each instruction in the sequence.
-    // This includes both the frame setup/destroy pseudos (handled above),
-    // as well as other instructions that have side effects w.r.t the SP.
-    // Note that this must come after eliminateFrameIndex, because
-    // if I itself referred to a frame index, we shouldn't count its own
-    // adjustment.
-    if (MI && InsideCallSequence)
-      SPAdj += TII.getSPAdjust(MI);
-
     if (DoIncr && I != BB->end()) ++I;
 
     // Update register states.
diff --git a/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp b/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp
index e3f01912b87..1557d10238e 100644
--- a/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp
+++ b/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp
@@ -42,8 +42,3 @@ int TargetFrameLowering::getFrameIndexReference(const MachineFunction &MF,
   FrameReg = RI->getFrameRegister(MF);
   return getFrameIndexOffset(MF, FI);
 }
-
-bool TargetFrameLowering::needsFrameIndexResolution(
-    const MachineFunction &MF) const {
-  return MF.getFrameInfo()->hasStackObjects();
-}
diff --git a/llvm/lib/Target/X86/CMakeLists.txt b/llvm/lib/Target/X86/CMakeLists.txt
index 461915f3414..1083fad80e8 100644
--- a/llvm/lib/Target/X86/CMakeLists.txt
+++ b/llvm/lib/Target/X86/CMakeLists.txt
@@ -1,54 +1,53 @@
-set(LLVM_TARGET_DEFINITIONS X86.td)
-
-tablegen(LLVM X86GenRegisterInfo.inc -gen-register-info)
-tablegen(LLVM X86GenDisassemblerTables.inc -gen-disassembler)
-tablegen(LLVM X86GenInstrInfo.inc -gen-instr-info)
-tablegen(LLVM X86GenAsmWriter.inc -gen-asm-writer)
-tablegen(LLVM X86GenAsmWriter1.inc -gen-asm-writer -asmwriternum=1)
-tablegen(LLVM X86GenAsmMatcher.inc -gen-asm-matcher)
-tablegen(LLVM X86GenDAGISel.inc -gen-dag-isel)
-tablegen(LLVM X86GenFastISel.inc -gen-fast-isel)
-tablegen(LLVM X86GenCallingConv.inc -gen-callingconv)
-tablegen(LLVM X86GenSubtargetInfo.inc -gen-subtarget)
-add_public_tablegen_target(X86CommonTableGen)
-
-set(sources
- X86AsmPrinter.cpp
- X86CallFrameOptimization.cpp
- X86FastISel.cpp
- X86FloatingPoint.cpp
- X86FrameLowering.cpp
- X86ISelDAGToDAG.cpp
- X86ISelLowering.cpp
- X86InstrInfo.cpp
- X86MCInstLower.cpp
- X86MachineFunctionInfo.cpp
- X86PadShortFunction.cpp
- X86RegisterInfo.cpp
- X86SelectionDAGInfo.cpp
- X86Subtarget.cpp
- X86TargetMachine.cpp
- X86TargetObjectFile.cpp
- X86TargetTransformInfo.cpp
- X86VZeroUpper.cpp
- X86FixupLEAs.cpp
- )
-
-if( CMAKE_CL_64 )
- enable_language(ASM_MASM)
- ADD_CUSTOM_COMMAND(
- OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/X86CompilationCallback_Win64.obj
- MAIN_DEPENDENCY X86CompilationCallback_Win64.asm
- COMMAND ${CMAKE_ASM_MASM_COMPILER} /Fo ${CMAKE_CURRENT_BINARY_DIR}/X86CompilationCallback_Win64.obj /c ${CMAKE_CURRENT_SOURCE_DIR}/X86CompilationCallback_Win64.asm
- )
- set(sources ${sources} ${CMAKE_CURRENT_BINARY_DIR}/X86CompilationCallback_Win64.obj)
-endif()
-
-add_llvm_target(X86CodeGen ${sources})
-
-add_subdirectory(AsmParser)
-add_subdirectory(Disassembler)
-add_subdirectory(InstPrinter)
-add_subdirectory(MCTargetDesc)
-add_subdirectory(TargetInfo)
-add_subdirectory(Utils)
+set(LLVM_TARGET_DEFINITIONS X86.td)
+
+tablegen(LLVM X86GenRegisterInfo.inc -gen-register-info)
+tablegen(LLVM X86GenDisassemblerTables.inc -gen-disassembler)
+tablegen(LLVM X86GenInstrInfo.inc -gen-instr-info)
+tablegen(LLVM X86GenAsmWriter.inc -gen-asm-writer)
+tablegen(LLVM X86GenAsmWriter1.inc -gen-asm-writer -asmwriternum=1)
+tablegen(LLVM X86GenAsmMatcher.inc -gen-asm-matcher)
+tablegen(LLVM X86GenDAGISel.inc -gen-dag-isel)
+tablegen(LLVM X86GenFastISel.inc -gen-fast-isel)
+tablegen(LLVM X86GenCallingConv.inc -gen-callingconv)
+tablegen(LLVM X86GenSubtargetInfo.inc -gen-subtarget)
+add_public_tablegen_target(X86CommonTableGen)
+
+set(sources
+ X86AsmPrinter.cpp
+ X86FastISel.cpp
+ X86FloatingPoint.cpp
+ X86FrameLowering.cpp
+ X86ISelDAGToDAG.cpp
+ X86ISelLowering.cpp
+ X86InstrInfo.cpp
+ X86MCInstLower.cpp
+ X86MachineFunctionInfo.cpp
+ X86PadShortFunction.cpp
+ X86RegisterInfo.cpp
+ X86SelectionDAGInfo.cpp
+ X86Subtarget.cpp
+ X86TargetMachine.cpp
+ X86TargetObjectFile.cpp
+ X86TargetTransformInfo.cpp
+ X86VZeroUpper.cpp
+ X86FixupLEAs.cpp
+ )
+
+if( CMAKE_CL_64 )
+ enable_language(ASM_MASM)
+ ADD_CUSTOM_COMMAND(
+ OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/X86CompilationCallback_Win64.obj
+ MAIN_DEPENDENCY X86CompilationCallback_Win64.asm
+ COMMAND ${CMAKE_ASM_MASM_COMPILER} /Fo ${CMAKE_CURRENT_BINARY_DIR}/X86CompilationCallback_Win64.obj /c ${CMAKE_CURRENT_SOURCE_DIR}/X86CompilationCallback_Win64.asm
+ )
+ set(sources ${sources} ${CMAKE_CURRENT_BINARY_DIR}/X86CompilationCallback_Win64.obj)
+endif()
+
+add_llvm_target(X86CodeGen ${sources})
+
+add_subdirectory(AsmParser)
+add_subdirectory(Disassembler)
+add_subdirectory(InstPrinter)
+add_subdirectory(MCTargetDesc)
+add_subdirectory(TargetInfo)
+add_subdirectory(Utils)
diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h
index 8b0a4cf477f..71fc567cb55 100644
--- a/llvm/lib/Target/X86/X86.h
+++ b/llvm/lib/Target/X86/X86.h
@@ -64,11 +64,6 @@ FunctionPass *createX86PadShortFunctions();
 /// to eliminate execution delays in some Atom processors.
 FunctionPass *createX86FixupLEAs();
 
-/// createX86CallFrameOptimization - Return a pass that optimizes
-/// the code-size of x86 call sequences. This is done by replacing
-/// esp-relative movs with pushes.
-FunctionPass *createX86CallFrameOptimization();
-
 } // End llvm namespace
 
 #endif
diff --git a/llvm/lib/Target/X86/X86CallFrameOptimization.cpp b/llvm/lib/Target/X86/X86CallFrameOptimization.cpp
deleted file mode 100644
index f832b94fdc6..00000000000
--- a/llvm/lib/Target/X86/X86CallFrameOptimization.cpp
+++ /dev/null
@@ -1,400 +0,0 @@
-//===----- X86CallFrameOptimization.cpp - Optimize x86 call sequences -----===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines a pass that optimizes call sequences on x86.
-// Currently, it converts movs of function parameters onto the stack into
-// pushes. This is beneficial for two main reasons:
-// 1) The push instruction encoding is much smaller than an esp-relative mov
-// 2) It is possible to push memory arguments directly. So, if the
-//    transformation is performed pre-reg-alloc, it can help relieve
-// register pressure.
-//
-//===----------------------------------------------------------------------===//
-
-#include <algorithm>
-
-#include "X86.h"
-#include "X86InstrInfo.h"
-#include "X86Subtarget.h"
-#include "X86MachineFunctionInfo.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "x86-cf-opt"
-
-cl::opt<bool> NoX86CFOpt("no-x86-call-frame-opt",
- cl::desc("Avoid optimizing x86 call frames for size"),
- cl::init(false), cl::Hidden);
-
-namespace {
-class X86CallFrameOptimization : public MachineFunctionPass {
-public:
- X86CallFrameOptimization() : MachineFunctionPass(ID) {}
-
- bool runOnMachineFunction(MachineFunction &MF) override;
-
-private:
- bool shouldPerformTransformation(MachineFunction &MF);
-
- bool adjustCallSequence(MachineFunction &MF, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I);
-
- MachineInstr *canFoldIntoRegPush(MachineBasicBlock::iterator FrameSetup,
- unsigned Reg);
-
- const char *getPassName() const override {
- return "X86 Optimize Call Frame";
- }
-
- const TargetInstrInfo *TII;
- const TargetFrameLowering *TFL;
- const MachineRegisterInfo *MRI;
- static char ID;
-};
-
-char X86CallFrameOptimization::ID = 0;
-}
-
-FunctionPass *llvm::createX86CallFrameOptimization() {
- return new X86CallFrameOptimization();
-}
-
-// This checks whether the transformation is legal and profitable
-bool X86CallFrameOptimization::shouldPerformTransformation(MachineFunction &MF) {
- if (NoX86CFOpt.getValue())
- return false;
-
- // We currently only support call sequences where *all* parameters
- // are passed on the stack.
- // No point in running this in 64-bit mode, since some arguments are
- // passed in-register in all common calling conventions, so the pattern
- // we're looking for will never match.
- const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
- if (STI.is64Bit())
- return false;
-
- // You would expect straight-line code between call-frame setup and
- // call-frame destroy. You would be wrong. There are circumstances (e.g.
- // CMOV_GR8 expansion of a select that feeds a function call!) where we can
- // end up with the setup and the destroy in different basic blocks.
- // This is bad, and breaks SP adjustment.
- // So, check that all of the frames in the function are closed inside
- // the same block, and, for good measure, that there are no nested frames.
- int FrameSetupOpcode = TII->getCallFrameSetupOpcode();
- int FrameDestroyOpcode = TII->getCallFrameDestroyOpcode();
- for (MachineBasicBlock &BB : MF) {
- bool InsideFrameSequence = false;
- for (MachineInstr &MI : BB) {
- if (MI.getOpcode() == FrameSetupOpcode) {
- if (InsideFrameSequence)
- return false;
- InsideFrameSequence = true;
- }
- else if (MI.getOpcode() == FrameDestroyOpcode) {
- if (!InsideFrameSequence)
- return false;
- InsideFrameSequence = false;
- }
- }
-
- if (InsideFrameSequence)
- return false;
- }
-
- // Now that we know the transformation is legal, check if it is
- // profitable.
- // TODO: Add a heuristic that actually looks at the function,
- // and enable this for more cases.
-
- // This transformation is always a win when we expected to have
- // a reserved call frame. Under other circumstances, it may be either
- // a win or a loss, and requires a heuristic.
- // For now, enable it only for the relatively clear win cases.
- bool CannotReserveFrame = MF.getFrameInfo()->hasVarSizedObjects();
- if (CannotReserveFrame)
- return true;
-
- // For now, don't even try to evaluate the profitability when
- // not optimizing for size.
- AttributeSet FnAttrs = MF.getFunction()->getAttributes();
- bool OptForSize =
- FnAttrs.hasAttribute(AttributeSet::FunctionIndex,
- Attribute::OptimizeForSize) ||
- FnAttrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize);
-
- if (!OptForSize)
- return false;
-
- // Stack re-alignment can make this unprofitable even in terms of size.
- // As mentioned above, a better heuristic is needed. For now, don't do this
- // when the required alignment is above 8. (4 would be the safe choice, but
- // some experimentation showed 8 is generally good).
- if (TFL->getStackAlignment() > 8)
- return false;
-
- return true;
-}
-
-bool X86CallFrameOptimization::runOnMachineFunction(MachineFunction &MF) {
- TII = MF.getSubtarget().getInstrInfo();
- TFL = MF.getSubtarget().getFrameLowering();
- MRI = &MF.getRegInfo();
-
- if (!shouldPerformTransformation(MF))
- return false;
-
- int FrameSetupOpcode = TII->getCallFrameSetupOpcode();
-
- bool Changed = false;
-
- for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB)
- for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I)
- if (I->getOpcode() == FrameSetupOpcode)
- Changed |= adjustCallSequence(MF, *BB, I);
-
- return Changed;
-}
-
-bool X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
- MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I) {
-
- // Check that this particular call sequence is amenable to the
- // transformation.
- const X86RegisterInfo &RegInfo = *static_cast<const X86RegisterInfo *>(
- MF.getSubtarget().getRegisterInfo());
- unsigned StackPtr = RegInfo.getStackRegister();
- int FrameDestroyOpcode = TII->getCallFrameDestroyOpcode();
-
- // We expect to enter this at the beginning of a call sequence
- assert(I->getOpcode() == TII->getCallFrameSetupOpcode());
- MachineBasicBlock::iterator FrameSetup = I++;
-
-
- // For globals in PIC mode, we can have some LEAs here.
- // Ignore them, they don't bother us.
- // TODO: Extend this to something that covers more cases.
- while (I->getOpcode() == X86::LEA32r)
- ++I;
-
- // We expect a copy instruction here.
- // TODO: The copy instruction is a lowering artifact.
- // We should also support a copy-less version, where the stack
- // pointer is used directly.
- if (!I->isCopy() || !I->getOperand(0).isReg())
- return false;
- MachineBasicBlock::iterator SPCopy = I++;
- StackPtr = SPCopy->getOperand(0).getReg();
-
- // Scan the call setup sequence for the pattern we're looking for.
- // We only handle a simple case - a sequence of MOV32mi or MOV32mr
- // instructions, that push a sequence of 32-bit values onto the stack, with
- // no gaps between them.
- SmallVector<MachineInstr*, 4> MovVector(4, nullptr);
- unsigned int MaxAdjust = FrameSetup->getOperand(0).getImm() / 4;
- if (MaxAdjust > 4)
- MovVector.resize(MaxAdjust, nullptr);
-
- do {
- int Opcode = I->getOpcode();
- if (Opcode != X86::MOV32mi && Opcode != X86::MOV32mr)
- break;
-
- // We only want movs of the form:
- // movl imm/r32, k(%esp)
- // If we run into something else, bail.
- // Note that AddrBaseReg may, counter to its name, not be a register,
- // but rather a frame index.
- // TODO: Support the fi case. This should probably work now that we
- // have the infrastructure to track the stack pointer within a call
- // sequence.
- if (!I->getOperand(X86::AddrBaseReg).isReg() ||
- (I->getOperand(X86::AddrBaseReg).getReg() != StackPtr) ||
- !I->getOperand(X86::AddrScaleAmt).isImm() ||
- (I->getOperand(X86::AddrScaleAmt).getImm() != 1) ||
- (I->getOperand(X86::AddrIndexReg).getReg() != X86::NoRegister) ||
- (I->getOperand(X86::AddrSegmentReg).getReg() != X86::NoRegister) ||
- !I->getOperand(X86::AddrDisp).isImm())
- return false;
-
- int64_t StackDisp = I->getOperand(X86::AddrDisp).getImm();
- assert(StackDisp >= 0 && "Negative stack displacement when passing parameters");
-
- // We really don't want to consider the unaligned case.
- if (StackDisp % 4)
- return false;
- StackDisp /= 4;
-
- assert((size_t)StackDisp < MovVector.size() &&
- "Function call has more parameters than the stack is adjusted for.");
-
- // If the same stack slot is being filled twice, something's fishy.
- if (MovVector[StackDisp] != nullptr)
- return false;
- MovVector[StackDisp] = I;
-
- ++I;
- } while (I != MBB.end());
-
- // We now expect the end of the sequence - a call and a stack adjust.
- if (I == MBB.end())
- return false;
-
- // For PCrel calls, we expect an additional COPY of the basereg.
- // If we find one, skip it.
- if (I->isCopy()) {
- if (I->getOperand(1).getReg() ==
- MF.getInfo<X86MachineFunctionInfo>()->getGlobalBaseReg())
- ++I;
- else
- return false;
- }
-
- if (!I->isCall())
- return false;
- MachineBasicBlock::iterator Call = I;
- if ((++I)->getOpcode() != FrameDestroyOpcode)
- return false;
-
- // Now, go through the vector, and see that we don't have any gaps,
- // but only a series of 32-bit MOVs.
-
- int64_t ExpectedDist = 0;
- auto MMI = MovVector.begin(), MME = MovVector.end();
- for (; MMI != MME; ++MMI, ExpectedDist += 4)
- if (*MMI == nullptr)
- break;
-
- // If the call had no parameters, do nothing
- if (!ExpectedDist)
- return false;
-
- // We are either at the last parameter, or a gap.
- // Make sure it's not a gap
- for (; MMI != MME; ++MMI)
- if (*MMI != nullptr)
- return false;
-
- // Ok, we can in fact do the transformation for this call.
- // Do not remove the FrameSetup instruction, but adjust the parameters.
- // PEI will end up finalizing the handling of this.
- FrameSetup->getOperand(1).setImm(ExpectedDist);
-
- DebugLoc DL = I->getDebugLoc();
- // Now, iterate through the vector in reverse order, and replace the movs
- // with pushes. MOVmi/MOVmr doesn't have any defs, so no need to
- // replace uses.
- for (int Idx = (ExpectedDist / 4) - 1; Idx >= 0; --Idx) {
- MachineBasicBlock::iterator MOV = *MovVector[Idx];
- MachineOperand PushOp = MOV->getOperand(X86::AddrNumOperands);
- if (MOV->getOpcode() == X86::MOV32mi) {
- unsigned PushOpcode = X86::PUSHi32;
- // If the operand is a small (8-bit) immediate, we can use a
- // PUSH instruction with a shorter encoding.
- // Note that isImm() may fail even though this is a MOVmi, because
- // the operand can also be a symbol.
- if (PushOp.isImm()) {
- int64_t Val = PushOp.getImm();
- if (isInt<8>(Val))
- PushOpcode = X86::PUSH32i8;
- }
- BuildMI(MBB, Call, DL, TII->get(PushOpcode)).addOperand(PushOp);
- } else {
- unsigned int Reg = PushOp.getReg();
-
- // If PUSHrmm is not slow on this target, try to fold the source of the
- // push into the instruction.
- const X86Subtarget &ST = MF.getTarget().getSubtarget<X86Subtarget>();
- bool SlowPUSHrmm = ST.isAtom() || ST.isSLM();
-
- // Check that this is legal to fold. Right now, we're extremely
- // conservative about that.
- MachineInstr *DefMov = nullptr;
- if (!SlowPUSHrmm && (DefMov = canFoldIntoRegPush(FrameSetup, Reg))) {
- MachineInstr *Push = BuildMI(MBB, Call, DL, TII->get(X86::PUSH32rmm));
-
- unsigned NumOps = DefMov->getDesc().getNumOperands();
- for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i)
- Push->addOperand(DefMov->getOperand(i));
-
- DefMov->eraseFromParent();
- } else {
- BuildMI(MBB, Call, DL, TII->get(X86::PUSH32r)).addReg(Reg).getInstr();
- }
- }
-
- MBB.erase(MOV);
- }
-
- // The stack-pointer copy is no longer used in the call sequences.
- // There should not be any other users, but we can't commit to that, so:
- if (MRI->use_empty(SPCopy->getOperand(0).getReg()))
- SPCopy->eraseFromParent();
-
- // Once we've done this, we need to make sure PEI doesn't assume a reserved
- // frame.
- X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
- FuncInfo->setHasPushSequences(true);
-
- return true;
-}
-
-MachineInstr *X86CallFrameOptimization::canFoldIntoRegPush(
- MachineBasicBlock::iterator FrameSetup, unsigned Reg) {
- // Do an extremely restricted form of load folding.
- // ISel will often create patterns like:
- // movl 4(%edi), %eax
- // movl 8(%edi), %ecx
- // movl 12(%edi), %edx
- // movl %edx, 8(%esp)
- // movl %ecx, 4(%esp)
- // movl %eax, (%esp)
- // call
- // Get rid of those with prejudice.
- if (!TargetRegisterInfo::isVirtualRegister(Reg))
- return nullptr;
-
- // Make sure this is the only use of Reg.
- if (!MRI->hasOneNonDBGUse(Reg))
- return nullptr;
-
- MachineBasicBlock::iterator DefMI = MRI->getVRegDef(Reg);
-
- // Make sure the def is a MOV from memory.
- // If the def is in another block, give up.
- if (DefMI->getOpcode() != X86::MOV32rm ||
- DefMI->getParent() != FrameSetup->getParent())
- return nullptr;
-
- // Be careful with movs that load from a stack slot, since it may get
- // resolved incorrectly.
- // TODO: Again, we already have the infrastructure, so this should work.
- if (!DefMI->getOperand(1).isReg())
- return nullptr;
-
- // Now, make sure everything else up until the ADJCALLSTACK is a sequence
- // of MOVs. To be less conservative would require duplicating a lot of the
- // logic from PeepholeOptimizer.
- // FIXME: A possibly better approach would be to teach the PeepholeOptimizer
- // to be smarter about folding into pushes.
- for (auto I = DefMI; I != FrameSetup; ++I)
- if (I->getOpcode() != X86::MOV32rm)
- return nullptr;
-
- return DefMI;
-}
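The canFoldIntoRegPush helper above covers the common ISel pattern where each pushed register is defined by a single 32-bit load right before the call sequence; when PUSH32rmm is not slow on the subtarget, the load is folded into the push. A rough sketch of the effect (placeholder operands, not taken from this commit):

    # Unfolded: load into a register, then push the register
    movl  8(%edi), %ecx
    pushl %ecx

    # Folded: push the memory operand directly (PUSH32rmm)
    pushl 8(%edi)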
diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp
index 220ba312197..227cacd24eb 100644
--- a/llvm/lib/Target/X86/X86FastISel.cpp
+++ b/llvm/lib/Target/X86/X86FastISel.cpp
@@ -1,3358 +1,3358 @@
-//===-- X86FastISel.cpp - X86 FastISel implementation ---------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the X86-specific support for the FastISel class. Much
-// of the target-specific code is generated by tablegen in the file
-// X86GenFastISel.inc, which is #included here.
-//
-//===----------------------------------------------------------------------===//
-
-#include "X86.h"
-#include "X86CallingConv.h"
-#include "X86InstrBuilder.h"
-#include "X86InstrInfo.h"
-#include "X86MachineFunctionInfo.h"
-#include "X86RegisterInfo.h"
-#include "X86Subtarget.h"
-#include "X86TargetMachine.h"
-#include "llvm/Analysis/BranchProbabilityInfo.h"
-#include "llvm/CodeGen/Analysis.h"
-#include "llvm/CodeGen/FastISel.h"
-#include "llvm/CodeGen/FunctionLoweringInfo.h"
-#include "llvm/CodeGen/MachineConstantPool.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/IR/CallSite.h"
-#include "llvm/IR/CallingConv.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/GetElementPtrTypeIterator.h"
-#include "llvm/IR/GlobalAlias.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Target/TargetOptions.h"
-using namespace llvm;
-
-namespace {
-
-class X86FastISel final : public FastISel {
- /// Subtarget - Keep a pointer to the X86Subtarget around so that we can
- /// make the right decision when generating code for different targets.
- const X86Subtarget *Subtarget;
-
- /// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87
- /// floating point ops.
- /// When SSE is available, use it for f32 operations.
- /// When SSE2 is available, use it for f64 operations.
- bool X86ScalarSSEf64;
- bool X86ScalarSSEf32;
-
-public:
- explicit X86FastISel(FunctionLoweringInfo &funcInfo,
- const TargetLibraryInfo *libInfo)
- : FastISel(funcInfo, libInfo) {
- Subtarget = &TM.getSubtarget<X86Subtarget>();
- X86ScalarSSEf64 = Subtarget->hasSSE2();
- X86ScalarSSEf32 = Subtarget->hasSSE1();
- }
-
- bool fastSelectInstruction(const Instruction *I) override;
-
- /// \brief The specified machine instr operand is a vreg, and that
- /// vreg is being provided by the specified load instruction. If possible,
- /// try to fold the load as an operand to the instruction, returning true if
- /// possible.
- bool tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
- const LoadInst *LI) override;
-
- bool fastLowerArguments() override;
- bool fastLowerCall(CallLoweringInfo &CLI) override;
- bool fastLowerIntrinsicCall(const IntrinsicInst *II) override;
-
-#include "X86GenFastISel.inc"
-
-private:
- bool X86FastEmitCompare(const Value *LHS, const Value *RHS, EVT VT, DebugLoc DL);
-
- bool X86FastEmitLoad(EVT VT, const X86AddressMode &AM, MachineMemOperand *MMO,
- unsigned &ResultReg);
-
- bool X86FastEmitStore(EVT VT, const Value *Val, const X86AddressMode &AM,
- MachineMemOperand *MMO = nullptr, bool Aligned = false);
- bool X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill,
- const X86AddressMode &AM,
- MachineMemOperand *MMO = nullptr, bool Aligned = false);
-
- bool X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, EVT SrcVT,
- unsigned &ResultReg);
-
- bool X86SelectAddress(const Value *V, X86AddressMode &AM);
- bool X86SelectCallAddress(const Value *V, X86AddressMode &AM);
-
- bool X86SelectLoad(const Instruction *I);
-
- bool X86SelectStore(const Instruction *I);
-
- bool X86SelectRet(const Instruction *I);
-
- bool X86SelectCmp(const Instruction *I);
-
- bool X86SelectZExt(const Instruction *I);
-
- bool X86SelectBranch(const Instruction *I);
-
- bool X86SelectShift(const Instruction *I);
-
- bool X86SelectDivRem(const Instruction *I);
-
- bool X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I);
-
- bool X86FastEmitSSESelect(MVT RetVT, const Instruction *I);
-
- bool X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I);
-
- bool X86SelectSelect(const Instruction *I);
-
- bool X86SelectTrunc(const Instruction *I);
-
- bool X86SelectFPExt(const Instruction *I);
- bool X86SelectFPTrunc(const Instruction *I);
-
- const X86InstrInfo *getInstrInfo() const {
- return getTargetMachine()->getSubtargetImpl()->getInstrInfo();
- }
- const X86TargetMachine *getTargetMachine() const {
- return static_cast<const X86TargetMachine *>(&TM);
- }
-
- bool handleConstantAddresses(const Value *V, X86AddressMode &AM);
-
- unsigned X86MaterializeInt(const ConstantInt *CI, MVT VT);
- unsigned X86MaterializeFP(const ConstantFP *CFP, MVT VT);
- unsigned X86MaterializeGV(const GlobalValue *GV, MVT VT);
- unsigned fastMaterializeConstant(const Constant *C) override;
-
- unsigned fastMaterializeAlloca(const AllocaInst *C) override;
-
- unsigned fastMaterializeFloatZero(const ConstantFP *CF) override;
-
- /// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is
- /// computed in an SSE register, not on the X87 floating point stack.
- bool isScalarFPTypeInSSEReg(EVT VT) const {
- return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2
- (VT == MVT::f32 && X86ScalarSSEf32); // f32 is when SSE1
- }
-
- bool isTypeLegal(Type *Ty, MVT &VT, bool AllowI1 = false);
-
- bool IsMemcpySmall(uint64_t Len);
-
- bool TryEmitSmallMemcpy(X86AddressMode DestAM,
- X86AddressMode SrcAM, uint64_t Len);
-
- bool foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I,
- const Value *Cond);
-};
-
-} // end anonymous namespace.
-
-static std::pair<X86::CondCode, bool>
-getX86ConditionCode(CmpInst::Predicate Predicate) {
- X86::CondCode CC = X86::COND_INVALID;
- bool NeedSwap = false;
- switch (Predicate) {
- default: break;
- // Floating-point Predicates
- case CmpInst::FCMP_UEQ: CC = X86::COND_E; break;
- case CmpInst::FCMP_OLT: NeedSwap = true; // fall-through
- case CmpInst::FCMP_OGT: CC = X86::COND_A; break;
- case CmpInst::FCMP_OLE: NeedSwap = true; // fall-through
- case CmpInst::FCMP_OGE: CC = X86::COND_AE; break;
- case CmpInst::FCMP_UGT: NeedSwap = true; // fall-through
- case CmpInst::FCMP_ULT: CC = X86::COND_B; break;
- case CmpInst::FCMP_UGE: NeedSwap = true; // fall-through
- case CmpInst::FCMP_ULE: CC = X86::COND_BE; break;
- case CmpInst::FCMP_ONE: CC = X86::COND_NE; break;
- case CmpInst::FCMP_UNO: CC = X86::COND_P; break;
- case CmpInst::FCMP_ORD: CC = X86::COND_NP; break;
- case CmpInst::FCMP_OEQ: // fall-through
- case CmpInst::FCMP_UNE: CC = X86::COND_INVALID; break;
-
- // Integer Predicates
- case CmpInst::ICMP_EQ: CC = X86::COND_E; break;
- case CmpInst::ICMP_NE: CC = X86::COND_NE; break;
- case CmpInst::ICMP_UGT: CC = X86::COND_A; break;
- case CmpInst::ICMP_UGE: CC = X86::COND_AE; break;
- case CmpInst::ICMP_ULT: CC = X86::COND_B; break;
- case CmpInst::ICMP_ULE: CC = X86::COND_BE; break;
- case CmpInst::ICMP_SGT: CC = X86::COND_G; break;
- case CmpInst::ICMP_SGE: CC = X86::COND_GE; break;
- case CmpInst::ICMP_SLT: CC = X86::COND_L; break;
- case CmpInst::ICMP_SLE: CC = X86::COND_LE; break;
- }
-
- return std::make_pair(CC, NeedSwap);
-}
-
-static std::pair<unsigned, bool>
-getX86SSEConditionCode(CmpInst::Predicate Predicate) {
- unsigned CC;
- bool NeedSwap = false;
-
- // SSE Condition code mapping:
- // 0 - EQ
- // 1 - LT
- // 2 - LE
- // 3 - UNORD
- // 4 - NEQ
- // 5 - NLT
- // 6 - NLE
- // 7 - ORD
- switch (Predicate) {
- default: llvm_unreachable("Unexpected predicate");
- case CmpInst::FCMP_OEQ: CC = 0; break;
- case CmpInst::FCMP_OGT: NeedSwap = true; // fall-through
- case CmpInst::FCMP_OLT: CC = 1; break;
- case CmpInst::FCMP_OGE: NeedSwap = true; // fall-through
- case CmpInst::FCMP_OLE: CC = 2; break;
- case CmpInst::FCMP_UNO: CC = 3; break;
- case CmpInst::FCMP_UNE: CC = 4; break;
- case CmpInst::FCMP_ULE: NeedSwap = true; // fall-through
- case CmpInst::FCMP_UGE: CC = 5; break;
- case CmpInst::FCMP_ULT: NeedSwap = true; // fall-through
- case CmpInst::FCMP_UGT: CC = 6; break;
- case CmpInst::FCMP_ORD: CC = 7; break;
- case CmpInst::FCMP_UEQ:
- case CmpInst::FCMP_ONE: CC = 8; break;
- }
-
- return std::make_pair(CC, NeedSwap);
-}
-
-/// \brief Check if it is possible to fold the condition from the XALU intrinsic
-/// into the user. The condition code will only be updated on success.
-bool X86FastISel::foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I,
- const Value *Cond) {
- if (!isa<ExtractValueInst>(Cond))
- return false;
-
- const auto *EV = cast<ExtractValueInst>(Cond);
- if (!isa<IntrinsicInst>(EV->getAggregateOperand()))
- return false;
-
- const auto *II = cast<IntrinsicInst>(EV->getAggregateOperand());
- MVT RetVT;
- const Function *Callee = II->getCalledFunction();
- Type *RetTy =
- cast<StructType>(Callee->getReturnType())->getTypeAtIndex(0U);
- if (!isTypeLegal(RetTy, RetVT))
- return false;
-
- if (RetVT != MVT::i32 && RetVT != MVT::i64)
- return false;
-
- X86::CondCode TmpCC;
- switch (II->getIntrinsicID()) {
- default: return false;
- case Intrinsic::sadd_with_overflow:
- case Intrinsic::ssub_with_overflow:
- case Intrinsic::smul_with_overflow:
- case Intrinsic::umul_with_overflow: TmpCC = X86::COND_O; break;
- case Intrinsic::uadd_with_overflow:
- case Intrinsic::usub_with_overflow: TmpCC = X86::COND_B; break;
- }
-
- // Check if both instructions are in the same basic block.
- if (II->getParent() != I->getParent())
- return false;
-
- // Make sure nothing is in the way
- BasicBlock::const_iterator Start = I;
- BasicBlock::const_iterator End = II;
- for (auto Itr = std::prev(Start); Itr != End; --Itr) {
- // We only expect extractvalue instructions between the intrinsic and the
- // instruction to be selected.
- if (!isa<ExtractValueInst>(Itr))
- return false;
-
- // Check that the extractvalue operand comes from the intrinsic.
- const auto *EVI = cast<ExtractValueInst>(Itr);
- if (EVI->getAggregateOperand() != II)
- return false;
- }
-
- CC = TmpCC;
- return true;
-}
-
-bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) {
- EVT evt = TLI.getValueType(Ty, /*HandleUnknown=*/true);
- if (evt == MVT::Other || !evt.isSimple())
- // Unhandled type. Halt "fast" selection and bail.
- return false;
-
- VT = evt.getSimpleVT();
- // For now, require SSE/SSE2 for performing floating-point operations,
- // since x87 requires additional work.
- if (VT == MVT::f64 && !X86ScalarSSEf64)
- return false;
- if (VT == MVT::f32 && !X86ScalarSSEf32)
- return false;
- // Similarly, no f80 support yet.
- if (VT == MVT::f80)
- return false;
- // We only handle legal types. For example, on x86-32 the instruction
- // selector contains all of the 64-bit instructions from x86-64,
- // under the assumption that i64 won't be used if the target doesn't
- // support it.
- return (AllowI1 && VT == MVT::i1) || TLI.isTypeLegal(VT);
-}
-
-#include "X86GenCallingConv.inc"
-
-/// X86FastEmitLoad - Emit a machine instruction to load a value of type VT.
-/// The address is either pre-computed, i.e. Ptr, or a GlobalAddress, i.e. GV.
-/// Return true and the result register by reference if it is possible.
-bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM,
- MachineMemOperand *MMO, unsigned &ResultReg) {
- // Get opcode and regclass of the output for the given load instruction.
- unsigned Opc = 0;
- const TargetRegisterClass *RC = nullptr;
- switch (VT.getSimpleVT().SimpleTy) {
- default: return false;
- case MVT::i1:
- case MVT::i8:
- Opc = X86::MOV8rm;
- RC = &X86::GR8RegClass;
- break;
- case MVT::i16:
- Opc = X86::MOV16rm;
- RC = &X86::GR16RegClass;
- break;
- case MVT::i32:
- Opc = X86::MOV32rm;
- RC = &X86::GR32RegClass;
- break;
- case MVT::i64:
- // Must be in x86-64 mode.
- Opc = X86::MOV64rm;
- RC = &X86::GR64RegClass;
- break;
- case MVT::f32:
- if (X86ScalarSSEf32) {
- Opc = Subtarget->hasAVX() ? X86::VMOVSSrm : X86::MOVSSrm;
- RC = &X86::FR32RegClass;
- } else {
- Opc = X86::LD_Fp32m;
- RC = &X86::RFP32RegClass;
- }
- break;
- case MVT::f64:
- if (X86ScalarSSEf64) {
- Opc = Subtarget->hasAVX() ? X86::VMOVSDrm : X86::MOVSDrm;
- RC = &X86::FR64RegClass;
- } else {
- Opc = X86::LD_Fp64m;
- RC = &X86::RFP64RegClass;
- }
- break;
- case MVT::f80:
- // No f80 support yet.
- return false;
- }
-
- ResultReg = createResultReg(RC);
- MachineInstrBuilder MIB =
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg);
- addFullAddress(MIB, AM);
- if (MMO)
- MIB->addMemOperand(*FuncInfo.MF, MMO);
- return true;
-}
-
-/// X86FastEmitStore - Emit a machine instruction to store a value Val of
-/// type VT. The address is either pre-computed, consisted of a base ptr, Ptr
-/// and a displacement offset, or a GlobalAddress,
-/// i.e. V. Return true if it is possible.
-bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill,
- const X86AddressMode &AM,
- MachineMemOperand *MMO, bool Aligned) {
- // Get opcode and regclass of the output for the given store instruction.
- unsigned Opc = 0;
- switch (VT.getSimpleVT().SimpleTy) {
- case MVT::f80: // No f80 support yet.
- default: return false;
- case MVT::i1: {
- // Mask out all but lowest bit.
- unsigned AndResult = createResultReg(&X86::GR8RegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(X86::AND8ri), AndResult)
- .addReg(ValReg, getKillRegState(ValIsKill)).addImm(1);
- ValReg = AndResult;
- }
- // FALLTHROUGH, handling i1 as i8.
- case MVT::i8: Opc = X86::MOV8mr; break;
- case MVT::i16: Opc = X86::MOV16mr; break;
- case MVT::i32: Opc = X86::MOV32mr; break;
- case MVT::i64: Opc = X86::MOV64mr; break; // Must be in x86-64 mode.
- case MVT::f32:
- Opc = X86ScalarSSEf32 ?
- (Subtarget->hasAVX() ? X86::VMOVSSmr : X86::MOVSSmr) : X86::ST_Fp32m;
- break;
- case MVT::f64:
- Opc = X86ScalarSSEf64 ?
- (Subtarget->hasAVX() ? X86::VMOVSDmr : X86::MOVSDmr) : X86::ST_Fp64m;
- break;
- case MVT::v4f32:
- if (Aligned)
- Opc = Subtarget->hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr;
- else
- Opc = Subtarget->hasAVX() ? X86::VMOVUPSmr : X86::MOVUPSmr;
- break;
- case MVT::v2f64:
- if (Aligned)
- Opc = Subtarget->hasAVX() ? X86::VMOVAPDmr : X86::MOVAPDmr;
- else
- Opc = Subtarget->hasAVX() ? X86::VMOVUPDmr : X86::MOVUPDmr;
- break;
- case MVT::v4i32:
- case MVT::v2i64:
- case MVT::v8i16:
- case MVT::v16i8:
- if (Aligned)
- Opc = Subtarget->hasAVX() ? X86::VMOVDQAmr : X86::MOVDQAmr;
- else
- Opc = Subtarget->hasAVX() ? X86::VMOVDQUmr : X86::MOVDQUmr;
- break;
- }
-
- MachineInstrBuilder MIB =
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc));
- addFullAddress(MIB, AM).addReg(ValReg, getKillRegState(ValIsKill));
- if (MMO)
- MIB->addMemOperand(*FuncInfo.MF, MMO);
-
- return true;
-}
-
-bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val,
- const X86AddressMode &AM,
- MachineMemOperand *MMO, bool Aligned) {
- // Handle 'null' like i32/i64 0.
- if (isa<ConstantPointerNull>(Val))
- Val = Constant::getNullValue(DL.getIntPtrType(Val->getContext()));
-
- // If this is a store of a simple constant, fold the constant into the store.
- if (const ConstantInt *CI = dyn_cast<ConstantInt>(Val)) {
- unsigned Opc = 0;
- bool Signed = true;
- switch (VT.getSimpleVT().SimpleTy) {
- default: break;
- case MVT::i1: Signed = false; // FALLTHROUGH to handle as i8.
- case MVT::i8: Opc = X86::MOV8mi; break;
- case MVT::i16: Opc = X86::MOV16mi; break;
- case MVT::i32: Opc = X86::MOV32mi; break;
- case MVT::i64:
- // Must be a 32-bit sign extended value.
- if (isInt<32>(CI->getSExtValue()))
- Opc = X86::MOV64mi32;
- break;
- }
-
- if (Opc) {
- MachineInstrBuilder MIB =
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc));
- addFullAddress(MIB, AM).addImm(Signed ? (uint64_t) CI->getSExtValue()
- : CI->getZExtValue());
- if (MMO)
- MIB->addMemOperand(*FuncInfo.MF, MMO);
- return true;
- }
- }
-
- unsigned ValReg = getRegForValue(Val);
- if (ValReg == 0)
- return false;
-
- bool ValKill = hasTrivialKill(Val);
- return X86FastEmitStore(VT, ValReg, ValKill, AM, MMO, Aligned);
-}
-
-/// X86FastEmitExtend - Emit a machine instruction to extend a value Src of
-/// type SrcVT to type DstVT using the specified extension opcode Opc (e.g.
-/// ISD::SIGN_EXTEND).
-bool X86FastISel::X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT,
- unsigned Src, EVT SrcVT,
- unsigned &ResultReg) {
- unsigned RR = fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Opc,
- Src, /*TODO: Kill=*/false);
- if (RR == 0)
- return false;
-
- ResultReg = RR;
- return true;
-}
-
-bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) {
- // Handle constant address.
- if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
- // Can't handle alternate code models yet.
- if (TM.getCodeModel() != CodeModel::Small)
- return false;
-
- // Can't handle TLS yet.
- if (GV->isThreadLocal())
- return false;
-
- // RIP-relative addresses can't have additional register operands, so if
- // we've already folded stuff into the addressing mode, just force the
- // global value into its own register, which we can use as the basereg.
- if (!Subtarget->isPICStyleRIPRel() ||
- (AM.Base.Reg == 0 && AM.IndexReg == 0)) {
- // Okay, we've committed to selecting this global. Set up the address.
- AM.GV = GV;
-
- // Allow the subtarget to classify the global.
- unsigned char GVFlags = Subtarget->ClassifyGlobalReference(GV, TM);
-
- // If this reference is relative to the pic base, set it now.
- if (isGlobalRelativeToPICBase(GVFlags)) {
- // FIXME: How do we know Base.Reg is free??
- AM.Base.Reg = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
- }
-
- // Unless the ABI requires an extra load, return a direct reference to
- // the global.
- if (!isGlobalStubReference(GVFlags)) {
- if (Subtarget->isPICStyleRIPRel()) {
- // Use rip-relative addressing if we can. Above we verified that the
- // base and index registers are unused.
- assert(AM.Base.Reg == 0 && AM.IndexReg == 0);
- AM.Base.Reg = X86::RIP;
- }
- AM.GVOpFlags = GVFlags;
- return true;
- }
-
- // Ok, we need to do a load from a stub. If we've already loaded from
- // this stub, reuse the loaded pointer, otherwise emit the load now.
- DenseMap<const Value *, unsigned>::iterator I = LocalValueMap.find(V);
- unsigned LoadReg;
- if (I != LocalValueMap.end() && I->second != 0) {
- LoadReg = I->second;
- } else {
- // Issue load from stub.
- unsigned Opc = 0;
- const TargetRegisterClass *RC = nullptr;
- X86AddressMode StubAM;
- StubAM.Base.Reg = AM.Base.Reg;
- StubAM.GV = GV;
- StubAM.GVOpFlags = GVFlags;
-
- // Prepare for inserting code in the local-value area.
- SavePoint SaveInsertPt = enterLocalValueArea();
-
- if (TLI.getPointerTy() == MVT::i64) {
- Opc = X86::MOV64rm;
- RC = &X86::GR64RegClass;
-
- if (Subtarget->isPICStyleRIPRel())
- StubAM.Base.Reg = X86::RIP;
- } else {
- Opc = X86::MOV32rm;
- RC = &X86::GR32RegClass;
- }
-
- LoadReg = createResultReg(RC);
- MachineInstrBuilder LoadMI =
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), LoadReg);
- addFullAddress(LoadMI, StubAM);
-
- // Ok, back to normal mode.
- leaveLocalValueArea(SaveInsertPt);
-
- // Prevent loading GV stub multiple times in same MBB.
- LocalValueMap[V] = LoadReg;
- }
-
- // Now construct the final address. Note that the Disp, Scale,
- // and Index values may already be set here.
- AM.Base.Reg = LoadReg;
- AM.GV = nullptr;
- return true;
- }
- }
-
- // If all else fails, try to materialize the value in a register.
- if (!AM.GV || !Subtarget->isPICStyleRIPRel()) {
- if (AM.Base.Reg == 0) {
- AM.Base.Reg = getRegForValue(V);
- return AM.Base.Reg != 0;
- }
- if (AM.IndexReg == 0) {
- assert(AM.Scale == 1 && "Scale with no index!");
- AM.IndexReg = getRegForValue(V);
- return AM.IndexReg != 0;
- }
- }
-
- return false;
-}
-
-/// X86SelectAddress - Attempt to fill in an address from the given value.
-///
-bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) {
- SmallVector<const Value *, 32> GEPs;
-redo_gep:
- const User *U = nullptr;
- unsigned Opcode = Instruction::UserOp1;
- if (const Instruction *I = dyn_cast<Instruction>(V)) {
- // Don't walk into other basic blocks; it's possible we haven't
- // visited them yet, so the instructions may not yet be assigned
- // virtual registers.
- if (FuncInfo.StaticAllocaMap.count(static_cast<const AllocaInst *>(V)) ||
- FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) {
- Opcode = I->getOpcode();
- U = I;
- }
- } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(V)) {
- Opcode = C->getOpcode();
- U = C;
- }
-
- if (PointerType *Ty = dyn_cast<PointerType>(V->getType()))
- if (Ty->getAddressSpace() > 255)
- // Fast instruction selection doesn't support the special
- // address spaces.
- return false;
-
- switch (Opcode) {
- default: break;
- case Instruction::BitCast:
- // Look past bitcasts.
- return X86SelectAddress(U->getOperand(0), AM);
-
- case Instruction::IntToPtr:
- // Look past no-op inttoptrs.
- if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy())
- return X86SelectAddress(U->getOperand(0), AM);
- break;
-
- case Instruction::PtrToInt:
- // Look past no-op ptrtoints.
- if (TLI.getValueType(U->getType()) == TLI.getPointerTy())
- return X86SelectAddress(U->getOperand(0), AM);
- break;
-
- case Instruction::Alloca: {
- // Do static allocas.
- const AllocaInst *A = cast<AllocaInst>(V);
- DenseMap<const AllocaInst *, int>::iterator SI =
- FuncInfo.StaticAllocaMap.find(A);
- if (SI != FuncInfo.StaticAllocaMap.end()) {
- AM.BaseType = X86AddressMode::FrameIndexBase;
- AM.Base.FrameIndex = SI->second;
- return true;
- }
- break;
- }
-
- case Instruction::Add: {
- // Adds of constants are common and easy enough.
- if (const ConstantInt *CI = dyn_cast<ConstantInt>(U->getOperand(1))) {
- uint64_t Disp = (int32_t)AM.Disp + (uint64_t)CI->getSExtValue();
- // They have to fit in the 32-bit signed displacement field though.
- if (isInt<32>(Disp)) {
- AM.Disp = (uint32_t)Disp;
- return X86SelectAddress(U->getOperand(0), AM);
- }
- }
- break;
- }
-
- case Instruction::GetElementPtr: {
- X86AddressMode SavedAM = AM;
-
- // Pattern-match simple GEPs.
- uint64_t Disp = (int32_t)AM.Disp;
- unsigned IndexReg = AM.IndexReg;
- unsigned Scale = AM.Scale;
- gep_type_iterator GTI = gep_type_begin(U);
- // Iterate through the indices, folding what we can. Constants can be
- // folded, and one dynamic index can be handled, if the scale is supported.
- for (User::const_op_iterator i = U->op_begin() + 1, e = U->op_end();
- i != e; ++i, ++GTI) {
- const Value *Op = *i;
- if (StructType *STy = dyn_cast<StructType>(*GTI)) {
- const StructLayout *SL = DL.getStructLayout(STy);
- Disp += SL->getElementOffset(cast<ConstantInt>(Op)->getZExtValue());
- continue;
- }
-
- // An array/variable index is always of the form i*S where S is the
- // constant scale size. See if we can push the scale into immediates.
- uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType());
- for (;;) {
- if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
- // Constant-offset addressing.
- Disp += CI->getSExtValue() * S;
- break;
- }
- if (canFoldAddIntoGEP(U, Op)) {
- // A compatible add with a constant operand. Fold the constant.
- ConstantInt *CI =
- cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1));
- Disp += CI->getSExtValue() * S;
- // Iterate on the other operand.
- Op = cast<AddOperator>(Op)->getOperand(0);
- continue;
- }
- if (IndexReg == 0 &&
- (!AM.GV || !Subtarget->isPICStyleRIPRel()) &&
- (S == 1 || S == 2 || S == 4 || S == 8)) {
- // Scaled-index addressing.
- Scale = S;
- IndexReg = getRegForGEPIndex(Op).first;
- if (IndexReg == 0)
- return false;
- break;
- }
- // Unsupported.
- goto unsupported_gep;
- }
- }
-
- // Check for displacement overflow.
- if (!isInt<32>(Disp))
- break;
-
- AM.IndexReg = IndexReg;
- AM.Scale = Scale;
- AM.Disp = (uint32_t)Disp;
- GEPs.push_back(V);
-
- if (const GetElementPtrInst *GEP =
- dyn_cast<GetElementPtrInst>(U->getOperand(0))) {
- // Ok, the GEP indices were covered by constant-offset and scaled-index
- // addressing. Update the address state and move on to examining the base.
- V = GEP;
- goto redo_gep;
- } else if (X86SelectAddress(U->getOperand(0), AM)) {
- return true;
- }
-
- // If we couldn't merge the gep value into this addr mode, revert back to
- // our address and just match the value instead of completely failing.
- AM = SavedAM;
-
- for (SmallVectorImpl<const Value *>::reverse_iterator
- I = GEPs.rbegin(), E = GEPs.rend(); I != E; ++I)
- if (handleConstantAddresses(*I, AM))
- return true;
-
- return false;
- unsupported_gep:
- // Ok, the GEP indices weren't all covered.
- break;
- }
- }
-
- return handleConstantAddresses(V, AM);
-}
-
-/// X86SelectCallAddress - Attempt to fill in an address from the given value.
-///
-bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) {
- const User *U = nullptr;
- unsigned Opcode = Instruction::UserOp1;
- const Instruction *I = dyn_cast<Instruction>(V);
- // Record if the value is defined in the same basic block.
- //
- // This information is crucial to know whether or not folding an
- // operand is valid.
- // Indeed, FastISel generates or reuses a virtual register for all
- // operands of all instructions it selects. Obviously, the definition and
- // its uses must use the same virtual register otherwise the produced
- // code is incorrect.
- // Before instruction selection, FunctionLoweringInfo::set sets the virtual
- // registers for values that are alive across basic blocks. This ensures
- // that the values are consistently set across basic blocks, even
- // if different instruction selection mechanisms are used (e.g., a mix of
- // SDISel and FastISel).
- // For values local to a basic block, the instruction selection process
- // generates these virtual registers with whatever method is appropriate
- // for its needs. In particular, FastISel and SDISel do not share the way
- // local virtual registers are set.
- // Therefore, this is impossible (or at least unsafe) to share values
- // between basic blocks unless they use the same instruction selection
- // method, which is not guaranteed for X86.
- // Moreover, things like hasOneUse could not be used accurately, if we
- // allow to reference values across basic blocks whereas they are not
- // alive across basic blocks initially.
- bool InMBB = true;
- if (I) {
- Opcode = I->getOpcode();
- U = I;
- InMBB = I->getParent() == FuncInfo.MBB->getBasicBlock();
- } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(V)) {
- Opcode = C->getOpcode();
- U = C;
- }
-
- switch (Opcode) {
- default: break;
- case Instruction::BitCast:
- // Look past bitcasts if its operand is in the same BB.
- if (InMBB)
- return X86SelectCallAddress(U->getOperand(0), AM);
- break;
-
- case Instruction::IntToPtr:
- // Look past no-op inttoptrs if its operand is in the same BB.
- if (InMBB &&
- TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy())
- return X86SelectCallAddress(U->getOperand(0), AM);
- break;
-
- case Instruction::PtrToInt:
- // Look past no-op ptrtoints if its operand is in the same BB.
- if (InMBB &&
- TLI.getValueType(U->getType()) == TLI.getPointerTy())
- return X86SelectCallAddress(U->getOperand(0), AM);
- break;
- }
-
- // Handle constant address.
- if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
- // Can't handle alternate code models yet.
- if (TM.getCodeModel() != CodeModel::Small)
- return false;
-
- // RIP-relative addresses can't have additional register operands.
- if (Subtarget->isPICStyleRIPRel() &&
- (AM.Base.Reg != 0 || AM.IndexReg != 0))
- return false;
-
- // Can't handle DLL Import.
- if (GV->hasDLLImportStorageClass())
- return false;
-
- // Can't handle TLS.
- if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
- if (GVar->isThreadLocal())
- return false;
-
- // Okay, we've committed to selecting this global. Set up the basic address.
- AM.GV = GV;
-
- // No ABI requires an extra load for anything other than DLLImport, which
- // we rejected above. Return a direct reference to the global.
- if (Subtarget->isPICStyleRIPRel()) {
- // Use rip-relative addressing if we can. Above we verified that the
- // base and index registers are unused.
- assert(AM.Base.Reg == 0 && AM.IndexReg == 0);
- AM.Base.Reg = X86::RIP;
- } else if (Subtarget->isPICStyleStubPIC()) {
- AM.GVOpFlags = X86II::MO_PIC_BASE_OFFSET;
- } else if (Subtarget->isPICStyleGOT()) {
- AM.GVOpFlags = X86II::MO_GOTOFF;
- }
-
- return true;
- }
-
- // If all else fails, try to materialize the value in a register.
- if (!AM.GV || !Subtarget->isPICStyleRIPRel()) {
- if (AM.Base.Reg == 0) {
- AM.Base.Reg = getRegForValue(V);
- return AM.Base.Reg != 0;
- }
- if (AM.IndexReg == 0) {
- assert(AM.Scale == 1 && "Scale with no index!");
- AM.IndexReg = getRegForValue(V);
- return AM.IndexReg != 0;
- }
- }
-
- return false;
-}
-
-
-/// X86SelectStore - Select and emit code to implement store instructions.
-bool X86FastISel::X86SelectStore(const Instruction *I) {
- // Atomic stores need special handling.
- const StoreInst *S = cast<StoreInst>(I);
-
- if (S->isAtomic())
- return false;
-
- const Value *Val = S->getValueOperand();
- const Value *Ptr = S->getPointerOperand();
-
- MVT VT;
- if (!isTypeLegal(Val->getType(), VT, /*AllowI1=*/true))
- return false;
-
- unsigned Alignment = S->getAlignment();
- unsigned ABIAlignment = DL.getABITypeAlignment(Val->getType());
- if (Alignment == 0) // Ensure that codegen never sees alignment 0
- Alignment = ABIAlignment;
- bool Aligned = Alignment >= ABIAlignment;
-
- X86AddressMode AM;
- if (!X86SelectAddress(Ptr, AM))
- return false;
-
- return X86FastEmitStore(VT, Val, AM, createMachineMemOperandFor(I), Aligned);
-}
-
-/// X86SelectRet - Select and emit code to implement ret instructions.
-bool X86FastISel::X86SelectRet(const Instruction *I) {
- const ReturnInst *Ret = cast<ReturnInst>(I);
- const Function &F = *I->getParent()->getParent();
- const X86MachineFunctionInfo *X86MFInfo =
- FuncInfo.MF->getInfo<X86MachineFunctionInfo>();
-
- if (!FuncInfo.CanLowerReturn)
- return false;
-
- CallingConv::ID CC = F.getCallingConv();
- if (CC != CallingConv::C &&
- CC != CallingConv::Fast &&
- CC != CallingConv::X86_FastCall &&
- CC != CallingConv::X86_64_SysV)
- return false;
-
- if (Subtarget->isCallingConvWin64(CC))
- return false;
-
- // Don't handle popping bytes on return for now.
- if (X86MFInfo->getBytesToPopOnReturn() != 0)
- return false;
-
- // fastcc with -tailcallopt is intended to provide a guaranteed
- // tail call optimization. Fastisel doesn't know how to do that.
- if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt)
- return false;
-
- // Let SDISel handle vararg functions.
- if (F.isVarArg())
- return false;
-
- // Build a list of return value registers.
- SmallVector<unsigned, 4> RetRegs;
-
- if (Ret->getNumOperands() > 0) {
- SmallVector<ISD::OutputArg, 4> Outs;
- GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI);
-
- // Analyze operands of the call, assigning locations to each operand.
- SmallVector<CCValAssign, 16> ValLocs;
- CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, ValLocs, I->getContext());
- CCInfo.AnalyzeReturn(Outs, RetCC_X86);
-
- const Value *RV = Ret->getOperand(0);
- unsigned Reg = getRegForValue(RV);
- if (Reg == 0)
- return false;
-
- // Only handle a single return value for now.
- if (ValLocs.size() != 1)
- return false;
-
- CCValAssign &VA = ValLocs[0];
-
- // Don't bother handling odd stuff for now.
- if (VA.getLocInfo() != CCValAssign::Full)
- return false;
- // Only handle register returns for now.
- if (!VA.isRegLoc())
- return false;
-
- // The calling-convention tables for x87 returns don't tell
- // the whole story.
- if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
- return false;
-
- unsigned SrcReg = Reg + VA.getValNo();
- EVT SrcVT = TLI.getValueType(RV->getType());
- EVT DstVT = VA.getValVT();
- // Special handling for extended integers.
- if (SrcVT != DstVT) {
- if (SrcVT != MVT::i1 && SrcVT != MVT::i8 && SrcVT != MVT::i16)
- return false;
-
- if (!Outs[0].Flags.isZExt() && !Outs[0].Flags.isSExt())
- return false;
-
- assert(DstVT == MVT::i32 && "X86 should always ext to i32");
-
- if (SrcVT == MVT::i1) {
- if (Outs[0].Flags.isSExt())
- return false;
- SrcReg = fastEmitZExtFromI1(MVT::i8, SrcReg, /*TODO: Kill=*/false);
- SrcVT = MVT::i8;
- }
- unsigned Op = Outs[0].Flags.isZExt() ? ISD::ZERO_EXTEND :
- ISD::SIGN_EXTEND;
- SrcReg = fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Op,
- SrcReg, /*TODO: Kill=*/false);
- }
-
- // Make the copy.
- unsigned DstReg = VA.getLocReg();
- const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
- // Avoid a cross-class copy. This is very unlikely.
- if (!SrcRC->contains(DstReg))
- return false;
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(TargetOpcode::COPY), DstReg).addReg(SrcReg);
-
- // Add register to return instruction.
- RetRegs.push_back(VA.getLocReg());
- }
-
- // The x86-64 ABI for returning structs by value requires that we copy
- // the sret argument into %rax for the return. We saved the argument into
- // a virtual register in the entry block, so now we copy the value out
- // and into %rax. We also do the same with %eax for Win32.
- if (F.hasStructRetAttr() &&
- (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) {
- unsigned Reg = X86MFInfo->getSRetReturnReg();
- assert(Reg &&
- "SRetReturnReg should have been set in LowerFormalArguments()!");
- unsigned RetReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(TargetOpcode::COPY), RetReg).addReg(Reg);
- RetRegs.push_back(RetReg);
- }
-
- // Now emit the RET.
- MachineInstrBuilder MIB =
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(Subtarget->is64Bit() ? X86::RETQ : X86::RETL));
- for (unsigned i = 0, e = RetRegs.size(); i != e; ++i)
- MIB.addReg(RetRegs[i], RegState::Implicit);
- return true;
-}
-
-/// X86SelectLoad - Select and emit code to implement load instructions.
-///
-bool X86FastISel::X86SelectLoad(const Instruction *I) {
- const LoadInst *LI = cast<LoadInst>(I);
-
- // Atomic loads need special handling.
- if (LI->isAtomic())
- return false;
-
- MVT VT;
- if (!isTypeLegal(LI->getType(), VT, /*AllowI1=*/true))
- return false;
-
- const Value *Ptr = LI->getPointerOperand();
-
- X86AddressMode AM;
- if (!X86SelectAddress(Ptr, AM))
- return false;
-
- unsigned ResultReg = 0;
- if (!X86FastEmitLoad(VT, AM, createMachineMemOperandFor(LI), ResultReg))
- return false;
-
- updateValueMap(I, ResultReg);
- return true;
-}
-
-static unsigned X86ChooseCmpOpcode(EVT VT, const X86Subtarget *Subtarget) {
- bool HasAVX = Subtarget->hasAVX();
- bool X86ScalarSSEf32 = Subtarget->hasSSE1();
- bool X86ScalarSSEf64 = Subtarget->hasSSE2();
-
- switch (VT.getSimpleVT().SimpleTy) {
- default: return 0;
- case MVT::i8: return X86::CMP8rr;
- case MVT::i16: return X86::CMP16rr;
- case MVT::i32: return X86::CMP32rr;
- case MVT::i64: return X86::CMP64rr;
- case MVT::f32:
- return X86ScalarSSEf32 ? (HasAVX ? X86::VUCOMISSrr : X86::UCOMISSrr) : 0;
- case MVT::f64:
- return X86ScalarSSEf64 ? (HasAVX ? X86::VUCOMISDrr : X86::UCOMISDrr) : 0;
- }
-}
-
-/// X86ChooseCmpImmediateOpcode - If we have a comparison with RHSC as the
-/// immediate operand of the comparison, return an opcode that works for the
-/// compare (e.g. CMP32ri); otherwise return 0.
-static unsigned X86ChooseCmpImmediateOpcode(EVT VT, const ConstantInt *RHSC) {
- switch (VT.getSimpleVT().SimpleTy) {
- // Otherwise, we can't fold the immediate into this comparison.
- default: return 0;
- case MVT::i8: return X86::CMP8ri;
- case MVT::i16: return X86::CMP16ri;
- case MVT::i32: return X86::CMP32ri;
- case MVT::i64:
- // 64-bit comparisons are only valid if the immediate fits in a 32-bit sext
- // field.
- if ((int)RHSC->getSExtValue() == RHSC->getSExtValue())
- return X86::CMP64ri32;
- return 0;
- }
-}
-
-bool X86FastISel::X86FastEmitCompare(const Value *Op0, const Value *Op1,
- EVT VT, DebugLoc CurDbgLoc) {
- unsigned Op0Reg = getRegForValue(Op0);
- if (Op0Reg == 0) return false;
-
- // Handle 'null' like i32/i64 0.
- if (isa<ConstantPointerNull>(Op1))
- Op1 = Constant::getNullValue(DL.getIntPtrType(Op0->getContext()));
-
- // We have two options: compare with register or immediate. If the RHS of
- // the compare is an immediate that we can fold into this compare, use
- // CMPri, otherwise use CMPrr.
- if (const ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) {
- if (unsigned CompareImmOpc = X86ChooseCmpImmediateOpcode(VT, Op1C)) {
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, CurDbgLoc, TII.get(CompareImmOpc))
- .addReg(Op0Reg)
- .addImm(Op1C->getSExtValue());
- return true;
- }
- }
-
- unsigned CompareOpc = X86ChooseCmpOpcode(VT, Subtarget);
- if (CompareOpc == 0) return false;
-
- unsigned Op1Reg = getRegForValue(Op1);
- if (Op1Reg == 0) return false;
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, CurDbgLoc, TII.get(CompareOpc))
- .addReg(Op0Reg)
- .addReg(Op1Reg);
-
- return true;
-}
-
-bool X86FastISel::X86SelectCmp(const Instruction *I) {
- const CmpInst *CI = cast<CmpInst>(I);
-
- MVT VT;
- if (!isTypeLegal(I->getOperand(0)->getType(), VT))
- return false;
-
- // Try to optimize or fold the cmp.
- CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
- unsigned ResultReg = 0;
- switch (Predicate) {
- default: break;
- case CmpInst::FCMP_FALSE: {
- ResultReg = createResultReg(&X86::GR32RegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV32r0),
- ResultReg);
- ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultReg, /*Kill=*/true,
- X86::sub_8bit);
- if (!ResultReg)
- return false;
- break;
- }
- case CmpInst::FCMP_TRUE: {
- ResultReg = createResultReg(&X86::GR8RegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV8ri),
- ResultReg).addImm(1);
- break;
- }
- }
-
- if (ResultReg) {
- updateValueMap(I, ResultReg);
- return true;
- }
-
- const Value *LHS = CI->getOperand(0);
- const Value *RHS = CI->getOperand(1);
-
- // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, 0.0.
- // We don't have to materialize a zero constant for this case and can just use
- // %x again on the RHS.
- if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) {
- const auto *RHSC = dyn_cast<ConstantFP>(RHS);
- if (RHSC && RHSC->isNullValue())
- RHS = LHS;
- }
-
- // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction.
- static unsigned SETFOpcTable[2][3] = {
- { X86::SETEr, X86::SETNPr, X86::AND8rr },
- { X86::SETNEr, X86::SETPr, X86::OR8rr }
- };
- unsigned *SETFOpc = nullptr;
- switch (Predicate) {
- default: break;
- case CmpInst::FCMP_OEQ: SETFOpc = &SETFOpcTable[0][0]; break;
- case CmpInst::FCMP_UNE: SETFOpc = &SETFOpcTable[1][0]; break;
- }
-
- ResultReg = createResultReg(&X86::GR8RegClass);
- if (SETFOpc) {
- if (!X86FastEmitCompare(LHS, RHS, VT, I->getDebugLoc()))
- return false;
-
- unsigned FlagReg1 = createResultReg(&X86::GR8RegClass);
- unsigned FlagReg2 = createResultReg(&X86::GR8RegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[0]),
- FlagReg1);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[1]),
- FlagReg2);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[2]),
- ResultReg).addReg(FlagReg1).addReg(FlagReg2);
- updateValueMap(I, ResultReg);
- return true;
- }
-
- X86::CondCode CC;
- bool SwapArgs;
- std::tie(CC, SwapArgs) = getX86ConditionCode(Predicate);
- assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
- unsigned Opc = X86::getSETFromCond(CC);
-
- if (SwapArgs)
- std::swap(LHS, RHS);
-
- // Emit a compare of LHS/RHS.
- if (!X86FastEmitCompare(LHS, RHS, VT, I->getDebugLoc()))
- return false;
-
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg);
- updateValueMap(I, ResultReg);
- return true;
-}
-
-bool X86FastISel::X86SelectZExt(const Instruction *I) {
- EVT DstVT = TLI.getValueType(I->getType());
- if (!TLI.isTypeLegal(DstVT))
- return false;
-
- unsigned ResultReg = getRegForValue(I->getOperand(0));
- if (ResultReg == 0)
- return false;
-
- // Handle zero-extension from i1 to i8, which is common.
- MVT SrcVT = TLI.getSimpleValueType(I->getOperand(0)->getType());
- if (SrcVT.SimpleTy == MVT::i1) {
- // Set the high bits to zero.
- ResultReg = fastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false);
- SrcVT = MVT::i8;
-
- if (ResultReg == 0)
- return false;
- }
-
- if (DstVT == MVT::i64) {
- // Handle extension to 64-bits via sub-register shenanigans.
- unsigned MovInst;
-
- switch (SrcVT.SimpleTy) {
- case MVT::i8: MovInst = X86::MOVZX32rr8; break;
- case MVT::i16: MovInst = X86::MOVZX32rr16; break;
- case MVT::i32: MovInst = X86::MOV32rr; break;
- default: llvm_unreachable("Unexpected zext to i64 source type");
- }
-
- unsigned Result32 = createResultReg(&X86::GR32RegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(MovInst), Result32)
- .addReg(ResultReg);
-
- ResultReg = createResultReg(&X86::GR64RegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::SUBREG_TO_REG),
- ResultReg)
- .addImm(0).addReg(Result32).addImm(X86::sub_32bit);
- } else if (DstVT != MVT::i8) {
- ResultReg = fastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::ZERO_EXTEND,
- ResultReg, /*Kill=*/true);
- if (ResultReg == 0)
- return false;
- }
-
- updateValueMap(I, ResultReg);
- return true;
-}
-
-bool X86FastISel::X86SelectBranch(const Instruction *I) {
- // Unconditional branches are selected by tablegen-generated code.
- // Handle a conditional branch.
- const BranchInst *BI = cast<BranchInst>(I);
- MachineBasicBlock *TrueMBB = FuncInfo.MBBMap[BI->getSuccessor(0)];
- MachineBasicBlock *FalseMBB = FuncInfo.MBBMap[BI->getSuccessor(1)];
-
- // Fold the common case of a conditional branch with a comparison
- // in the same block (values defined on other blocks may not have
- // initialized registers).
- X86::CondCode CC;
- if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
- if (CI->hasOneUse() && CI->getParent() == I->getParent()) {
- EVT VT = TLI.getValueType(CI->getOperand(0)->getType());
-
- // Try to optimize or fold the cmp.
- CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
- switch (Predicate) {
- default: break;
- case CmpInst::FCMP_FALSE: fastEmitBranch(FalseMBB, DbgLoc); return true;
- case CmpInst::FCMP_TRUE: fastEmitBranch(TrueMBB, DbgLoc); return true;
- }
-
- const Value *CmpLHS = CI->getOperand(0);
- const Value *CmpRHS = CI->getOperand(1);
-
- // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, 0.0.
- // We don't have to materialize a zero constant for this case and can just
- // use %x again on the RHS.
- if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) {
- const auto *CmpRHSC = dyn_cast<ConstantFP>(CmpRHS);
- if (CmpRHSC && CmpRHSC->isNullValue())
- CmpRHS = CmpLHS;
- }
-
- // Try to take advantage of fallthrough opportunities.
- if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) {
- std::swap(TrueMBB, FalseMBB);
- Predicate = CmpInst::getInversePredicate(Predicate);
- }
-
- // FCMP_OEQ and FCMP_UNE cannot be expressed with a single flag/condition
- // code check. Instead two branch instructions are required to check all
- // the flags. First we change the predicate to a supported condition code,
- // which will be the first branch. Later on, we will emit the second
- // branch.
- bool NeedExtraBranch = false;
- switch (Predicate) {
- default: break;
- case CmpInst::FCMP_OEQ:
- std::swap(TrueMBB, FalseMBB); // fall-through
- case CmpInst::FCMP_UNE:
- NeedExtraBranch = true;
- Predicate = CmpInst::FCMP_ONE;
- break;
- }
-
- bool SwapArgs;
- unsigned BranchOpc;
- std::tie(CC, SwapArgs) = getX86ConditionCode(Predicate);
- assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
-
- BranchOpc = X86::GetCondBranchFromCond(CC);
- if (SwapArgs)
- std::swap(CmpLHS, CmpRHS);
-
- // Emit a compare of the LHS and RHS, setting the flags.
- if (!X86FastEmitCompare(CmpLHS, CmpRHS, VT, CI->getDebugLoc()))
- return false;
-
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc))
- .addMBB(TrueMBB);
-
- // X86 requires a second branch to handle UNE (and OEQ, which is mapped
- // to UNE above).
- if (NeedExtraBranch) {
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JP_1))
- .addMBB(TrueMBB);
- }
-
- // Obtain the branch weight and add the TrueBB to the successor list.
- uint32_t BranchWeight = 0;
- if (FuncInfo.BPI)
- BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
- TrueMBB->getBasicBlock());
- FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight);
-
- // Emits an unconditional branch to the FalseBB, obtains the branch
- // weight, and adds it to the successor list.
- fastEmitBranch(FalseMBB, DbgLoc);
-
- return true;
- }
- } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) {
- // Handle things like "%cond = trunc i32 %X to i1 / br i1 %cond", which
- // typically happen for _Bool and C++ bools.
- MVT SourceVT;
- if (TI->hasOneUse() && TI->getParent() == I->getParent() &&
- isTypeLegal(TI->getOperand(0)->getType(), SourceVT)) {
- unsigned TestOpc = 0;
- switch (SourceVT.SimpleTy) {
- default: break;
- case MVT::i8: TestOpc = X86::TEST8ri; break;
- case MVT::i16: TestOpc = X86::TEST16ri; break;
- case MVT::i32: TestOpc = X86::TEST32ri; break;
- case MVT::i64: TestOpc = X86::TEST64ri32; break;
- }
- if (TestOpc) {
- unsigned OpReg = getRegForValue(TI->getOperand(0));
- if (OpReg == 0) return false;
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TestOpc))
- .addReg(OpReg).addImm(1);
-
- unsigned JmpOpc = X86::JNE_1;
- if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) {
- std::swap(TrueMBB, FalseMBB);
- JmpOpc = X86::JE_1;
- }
-
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(JmpOpc))
- .addMBB(TrueMBB);
- fastEmitBranch(FalseMBB, DbgLoc);
- uint32_t BranchWeight = 0;
- if (FuncInfo.BPI)
- BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
- TrueMBB->getBasicBlock());
- FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight);
- return true;
- }
- }
- } else if (foldX86XALUIntrinsic(CC, BI, BI->getCondition())) {
- // Fake request the condition; otherwise the intrinsic might be completely
- // optimized away.
- unsigned TmpReg = getRegForValue(BI->getCondition());
- if (TmpReg == 0)
- return false;
-
- unsigned BranchOpc = X86::GetCondBranchFromCond(CC);
-
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc))
- .addMBB(TrueMBB);
- fastEmitBranch(FalseMBB, DbgLoc);
- uint32_t BranchWeight = 0;
- if (FuncInfo.BPI)
- BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
- TrueMBB->getBasicBlock());
- FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight);
- return true;
- }
-
- // Otherwise do a clumsy setcc and re-test it.
- // Note that i1 essentially gets ANY_EXTEND'ed to i8 where it isn't used
- // in an explicit cast, so make sure to handle that correctly.
- unsigned OpReg = getRegForValue(BI->getCondition());
- if (OpReg == 0) return false;
-
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
- .addReg(OpReg).addImm(1);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JNE_1))
- .addMBB(TrueMBB);
- fastEmitBranch(FalseMBB, DbgLoc);
- uint32_t BranchWeight = 0;
- if (FuncInfo.BPI)
- BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
- TrueMBB->getBasicBlock());
- FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight);
- return true;
-}
-
-bool X86FastISel::X86SelectShift(const Instruction *I) {
- unsigned CReg = 0, OpReg = 0;
- const TargetRegisterClass *RC = nullptr;
- if (I->getType()->isIntegerTy(8)) {
- CReg = X86::CL;
- RC = &X86::GR8RegClass;
- switch (I->getOpcode()) {
- case Instruction::LShr: OpReg = X86::SHR8rCL; break;
- case Instruction::AShr: OpReg = X86::SAR8rCL; break;
- case Instruction::Shl: OpReg = X86::SHL8rCL; break;
- default: return false;
- }
- } else if (I->getType()->isIntegerTy(16)) {
- CReg = X86::CX;
- RC = &X86::GR16RegClass;
- switch (I->getOpcode()) {
- case Instruction::LShr: OpReg = X86::SHR16rCL; break;
- case Instruction::AShr: OpReg = X86::SAR16rCL; break;
- case Instruction::Shl: OpReg = X86::SHL16rCL; break;
- default: return false;
- }
- } else if (I->getType()->isIntegerTy(32)) {
- CReg = X86::ECX;
- RC = &X86::GR32RegClass;
- switch (I->getOpcode()) {
- case Instruction::LShr: OpReg = X86::SHR32rCL; break;
- case Instruction::AShr: OpReg = X86::SAR32rCL; break;
- case Instruction::Shl: OpReg = X86::SHL32rCL; break;
- default: return false;
- }
- } else if (I->getType()->isIntegerTy(64)) {
- CReg = X86::RCX;
- RC = &X86::GR64RegClass;
- switch (I->getOpcode()) {
- case Instruction::LShr: OpReg = X86::SHR64rCL; break;
- case Instruction::AShr: OpReg = X86::SAR64rCL; break;
- case Instruction::Shl: OpReg = X86::SHL64rCL; break;
- default: return false;
- }
- } else {
- return false;
- }
-
- MVT VT;
- if (!isTypeLegal(I->getType(), VT))
- return false;
-
- unsigned Op0Reg = getRegForValue(I->getOperand(0));
- if (Op0Reg == 0) return false;
-
- unsigned Op1Reg = getRegForValue(I->getOperand(1));
- if (Op1Reg == 0) return false;
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY),
- CReg).addReg(Op1Reg);
-
- // The shift instruction uses X86::CL. If we defined a super-register
- // of X86::CL, emit a subreg KILL to precisely describe what we're doing here.
- if (CReg != X86::CL)
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(TargetOpcode::KILL), X86::CL)
- .addReg(CReg, RegState::Kill);
-
- unsigned ResultReg = createResultReg(RC);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(OpReg), ResultReg)
- .addReg(Op0Reg);
- updateValueMap(I, ResultReg);
- return true;
-}
-
-bool X86FastISel::X86SelectDivRem(const Instruction *I) {
- const static unsigned NumTypes = 4; // i8, i16, i32, i64
- const static unsigned NumOps = 4; // SDiv, SRem, UDiv, URem
- const static bool S = true; // IsSigned
- const static bool U = false; // !IsSigned
- const static unsigned Copy = TargetOpcode::COPY;
- // For the X86 DIV/IDIV instruction, in most cases the dividend
- // (numerator) must be in a specific register pair highreg:lowreg,
- // producing the quotient in lowreg and the remainder in highreg.
- // For most data types, to set up the instruction, the dividend is
- // copied into lowreg, and lowreg is sign-extended or zero-extended
- // into highreg. The exception is i8, where the dividend is defined
- // as a single register rather than a register pair, and we
- // therefore directly sign-extend or zero-extend the dividend into
- // lowreg, instead of copying, and ignore the highreg.
- const static struct DivRemEntry {
- // The following portion depends only on the data type.
- const TargetRegisterClass *RC;
- unsigned LowInReg; // low part of the register pair
- unsigned HighInReg; // high part of the register pair
- // The following portion depends on both the data type and the operation.
- struct DivRemResult {
- unsigned OpDivRem; // The specific DIV/IDIV opcode to use.
- unsigned OpSignExtend; // Opcode for sign-extending lowreg into
- // highreg, or copying a zero into highreg.
- unsigned OpCopy; // Opcode for copying dividend into lowreg, or
- // zero/sign-extending into lowreg for i8.
- unsigned DivRemResultReg; // Register containing the desired result.
- bool IsOpSigned; // Whether to use signed or unsigned form.
- } ResultTable[NumOps];
- } OpTable[NumTypes] = {
- { &X86::GR8RegClass, X86::AX, 0, {
- { X86::IDIV8r, 0, X86::MOVSX16rr8, X86::AL, S }, // SDiv
- { X86::IDIV8r, 0, X86::MOVSX16rr8, X86::AH, S }, // SRem
- { X86::DIV8r, 0, X86::MOVZX16rr8, X86::AL, U }, // UDiv
- { X86::DIV8r, 0, X86::MOVZX16rr8, X86::AH, U }, // URem
- }
- }, // i8
- { &X86::GR16RegClass, X86::AX, X86::DX, {
- { X86::IDIV16r, X86::CWD, Copy, X86::AX, S }, // SDiv
- { X86::IDIV16r, X86::CWD, Copy, X86::DX, S }, // SRem
- { X86::DIV16r, X86::MOV32r0, Copy, X86::AX, U }, // UDiv
- { X86::DIV16r, X86::MOV32r0, Copy, X86::DX, U }, // URem
- }
- }, // i16
- { &X86::GR32RegClass, X86::EAX, X86::EDX, {
- { X86::IDIV32r, X86::CDQ, Copy, X86::EAX, S }, // SDiv
- { X86::IDIV32r, X86::CDQ, Copy, X86::EDX, S }, // SRem
- { X86::DIV32r, X86::MOV32r0, Copy, X86::EAX, U }, // UDiv
- { X86::DIV32r, X86::MOV32r0, Copy, X86::EDX, U }, // URem
- }
- }, // i32
- { &X86::GR64RegClass, X86::RAX, X86::RDX, {
- { X86::IDIV64r, X86::CQO, Copy, X86::RAX, S }, // SDiv
- { X86::IDIV64r, X86::CQO, Copy, X86::RDX, S }, // SRem
- { X86::DIV64r, X86::MOV32r0, Copy, X86::RAX, U }, // UDiv
- { X86::DIV64r, X86::MOV32r0, Copy, X86::RDX, U }, // URem
- }
- }, // i64
- };
-
- MVT VT;
- if (!isTypeLegal(I->getType(), VT))
- return false;
-
- unsigned TypeIndex, OpIndex;
- switch (VT.SimpleTy) {
- default: return false;
- case MVT::i8: TypeIndex = 0; break;
- case MVT::i16: TypeIndex = 1; break;
- case MVT::i32: TypeIndex = 2; break;
- case MVT::i64: TypeIndex = 3;
- if (!Subtarget->is64Bit())
- return false;
- break;
- }
-
- switch (I->getOpcode()) {
- default: llvm_unreachable("Unexpected div/rem opcode");
- case Instruction::SDiv: OpIndex = 0; break;
- case Instruction::SRem: OpIndex = 1; break;
- case Instruction::UDiv: OpIndex = 2; break;
- case Instruction::URem: OpIndex = 3; break;
- }
-
- const DivRemEntry &TypeEntry = OpTable[TypeIndex];
- const DivRemEntry::DivRemResult &OpEntry = TypeEntry.ResultTable[OpIndex];
- unsigned Op0Reg = getRegForValue(I->getOperand(0));
- if (Op0Reg == 0)
- return false;
- unsigned Op1Reg = getRegForValue(I->getOperand(1));
- if (Op1Reg == 0)
- return false;
-
- // Move op0 into low-order input register.
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(OpEntry.OpCopy), TypeEntry.LowInReg).addReg(Op0Reg);
- // Zero-extend or sign-extend into high-order input register.
- if (OpEntry.OpSignExtend) {
- if (OpEntry.IsOpSigned)
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(OpEntry.OpSignExtend));
- else {
- unsigned Zero32 = createResultReg(&X86::GR32RegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(X86::MOV32r0), Zero32);
-
- // Copy the zero into the appropriate sub/super/identical physical
- // register. Unfortunately the operations needed are not uniform enough
- // to fit neatly into the table above.
- if (VT.SimpleTy == MVT::i16) {
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(Copy), TypeEntry.HighInReg)
- .addReg(Zero32, 0, X86::sub_16bit);
- } else if (VT.SimpleTy == MVT::i32) {
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(Copy), TypeEntry.HighInReg)
- .addReg(Zero32);
- } else if (VT.SimpleTy == MVT::i64) {
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(TargetOpcode::SUBREG_TO_REG), TypeEntry.HighInReg)
- .addImm(0).addReg(Zero32).addImm(X86::sub_32bit);
- }
- }
- }
- // Generate the DIV/IDIV instruction.
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(OpEntry.OpDivRem)).addReg(Op1Reg);
- // For i8 remainder, we can't reference AH directly, as we'll end
- // up with bogus copies like %R9B = COPY %AH. Reference AX
- // instead to prevent AH references in a REX instruction.
- //
- // The current assumption of the fast register allocator is that isel
- // won't generate explicit references to the GPR8_NOREX registers. If
- // the allocator and/or the backend get enhanced to be more robust in
- // that regard, this can be, and should be, removed.
- unsigned ResultReg = 0;
- if ((I->getOpcode() == Instruction::SRem ||
- I->getOpcode() == Instruction::URem) &&
- OpEntry.DivRemResultReg == X86::AH && Subtarget->is64Bit()) {
- unsigned SourceSuperReg = createResultReg(&X86::GR16RegClass);
- unsigned ResultSuperReg = createResultReg(&X86::GR16RegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(Copy), SourceSuperReg).addReg(X86::AX);
-
- // Shift AX right by 8 bits instead of using AH.
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SHR16ri),
- ResultSuperReg).addReg(SourceSuperReg).addImm(8);
-
- // Now reference the 8-bit subreg of the result.
- ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultSuperReg,
- /*Kill=*/true, X86::sub_8bit);
- }
- // Copy the result out of the physreg if we haven't already.
- if (!ResultReg) {
- ResultReg = createResultReg(TypeEntry.RC);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Copy), ResultReg)
- .addReg(OpEntry.DivRemResultReg);
- }
- updateValueMap(I, ResultReg);
-
- return true;
-}
-
-/// \brief Emit a conditional move instruction (if they are supported) to lower
-/// the select.
-bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) {
- // Check if the subtarget supports these instructions.
- if (!Subtarget->hasCMov())
- return false;
-
- // FIXME: Add support for i8.
- if (RetVT < MVT::i16 || RetVT > MVT::i64)
- return false;
-
- const Value *Cond = I->getOperand(0);
- const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
- bool NeedTest = true;
- X86::CondCode CC = X86::COND_NE;
-
- // Optimize conditions coming from a compare if both instructions are in the
- // same basic block (values defined in other basic blocks may not have
- // initialized registers).
- const auto *CI = dyn_cast<CmpInst>(Cond);
- if (CI && (CI->getParent() == I->getParent())) {
- CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
-
- // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction.
- static unsigned SETFOpcTable[2][3] = {
- { X86::SETNPr, X86::SETEr , X86::TEST8rr },
- { X86::SETPr, X86::SETNEr, X86::OR8rr }
- };
- unsigned *SETFOpc = nullptr;
- switch (Predicate) {
- default: break;
- case CmpInst::FCMP_OEQ:
- SETFOpc = &SETFOpcTable[0][0];
- Predicate = CmpInst::ICMP_NE;
- break;
- case CmpInst::FCMP_UNE:
- SETFOpc = &SETFOpcTable[1][0];
- Predicate = CmpInst::ICMP_NE;
- break;
- }
-
- bool NeedSwap;
- std::tie(CC, NeedSwap) = getX86ConditionCode(Predicate);
- assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
-
- const Value *CmpLHS = CI->getOperand(0);
- const Value *CmpRHS = CI->getOperand(1);
- if (NeedSwap)
- std::swap(CmpLHS, CmpRHS);
-
- EVT CmpVT = TLI.getValueType(CmpLHS->getType());
- // Emit a compare of the LHS and RHS, setting the flags.
- if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT, CI->getDebugLoc()))
- return false;
-
- if (SETFOpc) {
- unsigned FlagReg1 = createResultReg(&X86::GR8RegClass);
- unsigned FlagReg2 = createResultReg(&X86::GR8RegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[0]),
- FlagReg1);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[1]),
- FlagReg2);
- auto const &II = TII.get(SETFOpc[2]);
- if (II.getNumDefs()) {
- unsigned TmpReg = createResultReg(&X86::GR8RegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, TmpReg)
- .addReg(FlagReg2).addReg(FlagReg1);
- } else {
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
- .addReg(FlagReg2).addReg(FlagReg1);
- }
- }
- NeedTest = false;
- } else if (foldX86XALUIntrinsic(CC, I, Cond)) {
- // Fake request the condition; otherwise the intrinsic might be completely
- // optimized away.
- unsigned TmpReg = getRegForValue(Cond);
- if (TmpReg == 0)
- return false;
-
- NeedTest = false;
- }
-
- if (NeedTest) {
- // Selects operate on i1; however, CondReg is 8 bits wide and may contain
- // garbage. Only the least significant bit is guaranteed to be accurate, so
- // reading more than the lsb may yield non-zero values even though the lsb
- // is zero. Therefore, we have to truncate CondReg to i1 for the select.
- // This is achieved by performing a TEST against 1.
- unsigned CondReg = getRegForValue(Cond);
- if (CondReg == 0)
- return false;
- bool CondIsKill = hasTrivialKill(Cond);
-
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
- .addReg(CondReg, getKillRegState(CondIsKill)).addImm(1);
- }
-
- const Value *LHS = I->getOperand(1);
- const Value *RHS = I->getOperand(2);
-
- unsigned RHSReg = getRegForValue(RHS);
- bool RHSIsKill = hasTrivialKill(RHS);
-
- unsigned LHSReg = getRegForValue(LHS);
- bool LHSIsKill = hasTrivialKill(LHS);
-
- if (!LHSReg || !RHSReg)
- return false;
-
- unsigned Opc = X86::getCMovFromCond(CC, RC->getSize());
- unsigned ResultReg = fastEmitInst_rr(Opc, RC, RHSReg, RHSIsKill,
- LHSReg, LHSIsKill);
- updateValueMap(I, ResultReg);
- return true;
-}
-
-/// \brief Emit SSE instructions to lower the select.
-///
-/// Try to use SSE1/SSE2 instructions to simulate a select without branches.
-/// This lowers fp selects into a CMP/AND/ANDN/OR sequence when the necessary
-/// SSE instructions are available.
-bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) {
- // Optimize conditions coming from a compare if both instructions are in the
- // same basic block (values defined in other basic blocks may not have
- // initialized registers).
- const auto *CI = dyn_cast<FCmpInst>(I->getOperand(0));
- if (!CI || (CI->getParent() != I->getParent()))
- return false;
-
- if (I->getType() != CI->getOperand(0)->getType() ||
- !((Subtarget->hasSSE1() && RetVT == MVT::f32) ||
- (Subtarget->hasSSE2() && RetVT == MVT::f64)))
- return false;
-
- const Value *CmpLHS = CI->getOperand(0);
- const Value *CmpRHS = CI->getOperand(1);
- CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
-
- // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, 0.0.
- // We don't have to materialize a zero constant for this case and can just use
- // %x again on the RHS.
- if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) {
- const auto *CmpRHSC = dyn_cast<ConstantFP>(CmpRHS);
- if (CmpRHSC && CmpRHSC->isNullValue())
- CmpRHS = CmpLHS;
- }
-
- unsigned CC;
- bool NeedSwap;
- std::tie(CC, NeedSwap) = getX86SSEConditionCode(Predicate);
- if (CC > 7)
- return false;
-
- if (NeedSwap)
- std::swap(CmpLHS, CmpRHS);
-
- static unsigned OpcTable[2][2][4] = {
- { { X86::CMPSSrr, X86::FsANDPSrr, X86::FsANDNPSrr, X86::FsORPSrr },
- { X86::VCMPSSrr, X86::VFsANDPSrr, X86::VFsANDNPSrr, X86::VFsORPSrr } },
- { { X86::CMPSDrr, X86::FsANDPDrr, X86::FsANDNPDrr, X86::FsORPDrr },
- { X86::VCMPSDrr, X86::VFsANDPDrr, X86::VFsANDNPDrr, X86::VFsORPDrr } }
- };
-
- bool HasAVX = Subtarget->hasAVX();
- unsigned *Opc = nullptr;
- switch (RetVT.SimpleTy) {
- default: return false;
- case MVT::f32: Opc = &OpcTable[0][HasAVX][0]; break;
- case MVT::f64: Opc = &OpcTable[1][HasAVX][0]; break;
- }
-
- const Value *LHS = I->getOperand(1);
- const Value *RHS = I->getOperand(2);
-
- unsigned LHSReg = getRegForValue(LHS);
- bool LHSIsKill = hasTrivialKill(LHS);
-
- unsigned RHSReg = getRegForValue(RHS);
- bool RHSIsKill = hasTrivialKill(RHS);
-
- unsigned CmpLHSReg = getRegForValue(CmpLHS);
- bool CmpLHSIsKill = hasTrivialKill(CmpLHS);
-
- unsigned CmpRHSReg = getRegForValue(CmpRHS);
- bool CmpRHSIsKill = hasTrivialKill(CmpRHS);
-
- if (!LHSReg || !RHSReg || !CmpLHSReg || !CmpRHSReg)
- return false;
-
- const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
- unsigned CmpReg = fastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill,
- CmpRHSReg, CmpRHSIsKill, CC);
- unsigned AndReg = fastEmitInst_rr(Opc[1], RC, CmpReg, /*IsKill=*/false,
- LHSReg, LHSIsKill);
- unsigned AndNReg = fastEmitInst_rr(Opc[2], RC, CmpReg, /*IsKill=*/true,
- RHSReg, RHSIsKill);
- unsigned ResultReg = fastEmitInst_rr(Opc[3], RC, AndNReg, /*IsKill=*/true,
- AndReg, /*IsKill=*/true);
- updateValueMap(I, ResultReg);
- return true;
-}
-
-bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) {
- // These are pseudo CMOV instructions and will be later expanded into control-
- // flow.
- unsigned Opc;
- switch (RetVT.SimpleTy) {
- default: return false;
- case MVT::i8: Opc = X86::CMOV_GR8; break;
- case MVT::i16: Opc = X86::CMOV_GR16; break;
- case MVT::i32: Opc = X86::CMOV_GR32; break;
- case MVT::f32: Opc = X86::CMOV_FR32; break;
- case MVT::f64: Opc = X86::CMOV_FR64; break;
- }
-
- const Value *Cond = I->getOperand(0);
- X86::CondCode CC = X86::COND_NE;
-
- // Optimize conditions coming from a compare if both instructions are in the
- // same basic block (values defined in other basic blocks may not have
- // initialized registers).
- const auto *CI = dyn_cast<CmpInst>(Cond);
- if (CI && (CI->getParent() == I->getParent())) {
- bool NeedSwap;
- std::tie(CC, NeedSwap) = getX86ConditionCode(CI->getPredicate());
- if (CC > X86::LAST_VALID_COND)
- return false;
-
- const Value *CmpLHS = CI->getOperand(0);
- const Value *CmpRHS = CI->getOperand(1);
-
- if (NeedSwap)
- std::swap(CmpLHS, CmpRHS);
-
- EVT CmpVT = TLI.getValueType(CmpLHS->getType());
- if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT, CI->getDebugLoc()))
- return false;
- } else {
- unsigned CondReg = getRegForValue(Cond);
- if (CondReg == 0)
- return false;
- bool CondIsKill = hasTrivialKill(Cond);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
- .addReg(CondReg, getKillRegState(CondIsKill)).addImm(1);
- }
-
- const Value *LHS = I->getOperand(1);
- const Value *RHS = I->getOperand(2);
-
- unsigned LHSReg = getRegForValue(LHS);
- bool LHSIsKill = hasTrivialKill(LHS);
-
- unsigned RHSReg = getRegForValue(RHS);
- bool RHSIsKill = hasTrivialKill(RHS);
-
- if (!LHSReg || !RHSReg)
- return false;
-
- const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
-
- unsigned ResultReg =
- fastEmitInst_rri(Opc, RC, RHSReg, RHSIsKill, LHSReg, LHSIsKill, CC);
- updateValueMap(I, ResultReg);
- return true;
-}
-
-bool X86FastISel::X86SelectSelect(const Instruction *I) {
- MVT RetVT;
- if (!isTypeLegal(I->getType(), RetVT))
- return false;
-
- // Check if we can fold the select.
- if (const auto *CI = dyn_cast<CmpInst>(I->getOperand(0))) {
- CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
- const Value *Opnd = nullptr;
- switch (Predicate) {
- default: break;
- case CmpInst::FCMP_FALSE: Opnd = I->getOperand(2); break;
- case CmpInst::FCMP_TRUE: Opnd = I->getOperand(1); break;
- }
- // No need for a select anymore - this is an unconditional move.
- if (Opnd) {
- unsigned OpReg = getRegForValue(Opnd);
- if (OpReg == 0)
- return false;
- bool OpIsKill = hasTrivialKill(Opnd);
- const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
- unsigned ResultReg = createResultReg(RC);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(TargetOpcode::COPY), ResultReg)
- .addReg(OpReg, getKillRegState(OpIsKill));
- updateValueMap(I, ResultReg);
- return true;
- }
- }
-
- // First try to use real conditional move instructions.
- if (X86FastEmitCMoveSelect(RetVT, I))
- return true;
-
- // Try to use a sequence of SSE instructions to simulate a conditional move.
- if (X86FastEmitSSESelect(RetVT, I))
- return true;
-
- // Fall-back to pseudo conditional move instructions, which will be later
- // converted to control-flow.
- if (X86FastEmitPseudoSelect(RetVT, I))
- return true;
-
- return false;
-}
-
-bool X86FastISel::X86SelectFPExt(const Instruction *I) {
- // fpext from float to double.
- if (X86ScalarSSEf64 &&
- I->getType()->isDoubleTy()) {
- const Value *V = I->getOperand(0);
- if (V->getType()->isFloatTy()) {
- unsigned OpReg = getRegForValue(V);
- if (OpReg == 0) return false;
- unsigned ResultReg = createResultReg(&X86::FR64RegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(X86::CVTSS2SDrr), ResultReg)
- .addReg(OpReg);
- updateValueMap(I, ResultReg);
- return true;
- }
- }
-
- return false;
-}
-
-bool X86FastISel::X86SelectFPTrunc(const Instruction *I) {
- if (X86ScalarSSEf64) {
- if (I->getType()->isFloatTy()) {
- const Value *V = I->getOperand(0);
- if (V->getType()->isDoubleTy()) {
- unsigned OpReg = getRegForValue(V);
- if (OpReg == 0) return false;
- unsigned ResultReg = createResultReg(&X86::FR32RegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(X86::CVTSD2SSrr), ResultReg)
- .addReg(OpReg);
- updateValueMap(I, ResultReg);
- return true;
- }
- }
- }
-
- return false;
-}
-
-bool X86FastISel::X86SelectTrunc(const Instruction *I) {
- EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType());
- EVT DstVT = TLI.getValueType(I->getType());
-
- // This code only handles truncation to byte.
- if (DstVT != MVT::i8 && DstVT != MVT::i1)
- return false;
- if (!TLI.isTypeLegal(SrcVT))
- return false;
-
- unsigned InputReg = getRegForValue(I->getOperand(0));
- if (!InputReg)
- // Unhandled operand. Halt "fast" selection and bail.
- return false;
-
- if (SrcVT == MVT::i8) {
- // Truncate from i8 to i1; no code needed.
- updateValueMap(I, InputReg);
- return true;
- }
-
- if (!Subtarget->is64Bit()) {
- // If we're on x86-32, we can't extract an i8 from a general register.
- // First issue a copy to GR16_ABCD or GR32_ABCD.
- const TargetRegisterClass *CopyRC =
- (SrcVT == MVT::i16) ? &X86::GR16_ABCDRegClass : &X86::GR32_ABCDRegClass;
- unsigned CopyReg = createResultReg(CopyRC);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(TargetOpcode::COPY), CopyReg).addReg(InputReg);
- InputReg = CopyReg;
- }
-
- // Issue an extract_subreg.
- unsigned ResultReg = fastEmitInst_extractsubreg(MVT::i8,
- InputReg, /*Kill=*/true,
- X86::sub_8bit);
- if (!ResultReg)
- return false;
-
- updateValueMap(I, ResultReg);
- return true;
-}
-
-bool X86FastISel::IsMemcpySmall(uint64_t Len) {
- return Len <= (Subtarget->is64Bit() ? 32 : 16);
-}
-
-bool X86FastISel::TryEmitSmallMemcpy(X86AddressMode DestAM,
- X86AddressMode SrcAM, uint64_t Len) {
-
- // Make sure we don't bloat code by inlining very large memcpy's.
- if (!IsMemcpySmall(Len))
- return false;
-
- bool i64Legal = Subtarget->is64Bit();
-
- // We don't care about alignment here since we just emit integer accesses.
- while (Len) {
- MVT VT;
- if (Len >= 8 && i64Legal)
- VT = MVT::i64;
- else if (Len >= 4)
- VT = MVT::i32;
- else if (Len >= 2)
- VT = MVT::i16;
- else
- VT = MVT::i8;
-
- unsigned Reg;
- bool RV = X86FastEmitLoad(VT, SrcAM, nullptr, Reg);
- RV &= X86FastEmitStore(VT, Reg, /*Kill=*/true, DestAM);
- assert(RV && "Failed to emit load or store??");
-
- unsigned Size = VT.getSizeInBits()/8;
- Len -= Size;
- DestAM.Disp += Size;
- SrcAM.Disp += Size;
- }
-
- return true;
-}
-
-bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
- // FIXME: Handle more intrinsics.
- switch (II->getIntrinsicID()) {
- default: return false;
- case Intrinsic::frameaddress: {
- Type *RetTy = II->getCalledFunction()->getReturnType();
-
- MVT VT;
- if (!isTypeLegal(RetTy, VT))
- return false;
-
- unsigned Opc;
- const TargetRegisterClass *RC = nullptr;
-
- switch (VT.SimpleTy) {
- default: llvm_unreachable("Invalid result type for frameaddress.");
- case MVT::i32: Opc = X86::MOV32rm; RC = &X86::GR32RegClass; break;
- case MVT::i64: Opc = X86::MOV64rm; RC = &X86::GR64RegClass; break;
- }
-
- // This needs to be set before we call getPtrSizedFrameRegister, otherwise
- // we get the wrong frame register.
- MachineFrameInfo *MFI = FuncInfo.MF->getFrameInfo();
- MFI->setFrameAddressIsTaken(true);
-
- const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
- TM.getSubtargetImpl()->getRegisterInfo());
- unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(*(FuncInfo.MF));
- assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
- (FrameReg == X86::EBP && VT == MVT::i32)) &&
- "Invalid Frame Register!");
-
- // Always make a copy of the frame register to a vreg first, so that we
- // never directly reference the frame register (the TwoAddressInstruction-
- // Pass doesn't like that).
- unsigned SrcReg = createResultReg(RC);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(TargetOpcode::COPY), SrcReg).addReg(FrameReg);
-
- // Now recursively load from the frame address.
- // movq (%rbp), %rax
- // movq (%rax), %rax
- // movq (%rax), %rax
- // ...
- unsigned DestReg;
- unsigned Depth = cast<ConstantInt>(II->getOperand(0))->getZExtValue();
- while (Depth--) {
- DestReg = createResultReg(RC);
- addDirectMem(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(Opc), DestReg), SrcReg);
- SrcReg = DestReg;
- }
-
- updateValueMap(II, SrcReg);
- return true;
- }
- case Intrinsic::memcpy: {
- const MemCpyInst *MCI = cast<MemCpyInst>(II);
- // Don't handle volatile or variable length memcpys.
- if (MCI->isVolatile())
- return false;
-
- if (isa<ConstantInt>(MCI->getLength())) {
- // Small memcpy's are common enough that we want to do them
- // without a call if possible.
- uint64_t Len = cast<ConstantInt>(MCI->getLength())->getZExtValue();
- if (IsMemcpySmall(Len)) {
- X86AddressMode DestAM, SrcAM;
- if (!X86SelectAddress(MCI->getRawDest(), DestAM) ||
- !X86SelectAddress(MCI->getRawSource(), SrcAM))
- return false;
- TryEmitSmallMemcpy(DestAM, SrcAM, Len);
- return true;
- }
- }
-
- unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32;
- if (!MCI->getLength()->getType()->isIntegerTy(SizeWidth))
- return false;
-
- if (MCI->getSourceAddressSpace() > 255 || MCI->getDestAddressSpace() > 255)
- return false;
-
- return lowerCallTo(II, "memcpy", II->getNumArgOperands() - 2);
- }
- case Intrinsic::memset: {
- const MemSetInst *MSI = cast<MemSetInst>(II);
-
- if (MSI->isVolatile())
- return false;
-
- unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32;
- if (!MSI->getLength()->getType()->isIntegerTy(SizeWidth))
- return false;
-
- if (MSI->getDestAddressSpace() > 255)
- return false;
-
- return lowerCallTo(II, "memset", II->getNumArgOperands() - 2);
- }
- case Intrinsic::stackprotector: {
- // Emit code to store the stack guard onto the stack.
- EVT PtrTy = TLI.getPointerTy();
-
- const Value *Op1 = II->getArgOperand(0); // The guard's value.
- const AllocaInst *Slot = cast<AllocaInst>(II->getArgOperand(1));
-
- MFI.setStackProtectorIndex(FuncInfo.StaticAllocaMap[Slot]);
-
- // Grab the frame index.
- X86AddressMode AM;
- if (!X86SelectAddress(Slot, AM)) return false;
- if (!X86FastEmitStore(PtrTy, Op1, AM)) return false;
- return true;
- }
- case Intrinsic::dbg_declare: {
- const DbgDeclareInst *DI = cast<DbgDeclareInst>(II);
- X86AddressMode AM;
- assert(DI->getAddress() && "Null address should be checked earlier!");
- if (!X86SelectAddress(DI->getAddress(), AM))
- return false;
- const MCInstrDesc &II = TII.get(TargetOpcode::DBG_VALUE);
- // FIXME may need to add RegState::Debug to any registers produced,
- // although ESP/EBP should be the only ones at the moment.
- addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II), AM)
- .addImm(0)
- .addMetadata(DI->getVariable())
- .addMetadata(DI->getExpression());
- return true;
- }
- case Intrinsic::trap: {
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TRAP));
- return true;
- }
- case Intrinsic::sqrt: {
- if (!Subtarget->hasSSE1())
- return false;
-
- Type *RetTy = II->getCalledFunction()->getReturnType();
-
- MVT VT;
- if (!isTypeLegal(RetTy, VT))
- return false;
-
- // Unfortunately we can't use fastEmit_r, because the AVX version of FSQRT
- // is not generated by FastISel yet.
- // FIXME: Update this code once tablegen can handle it.
- static const unsigned SqrtOpc[2][2] = {
- {X86::SQRTSSr, X86::VSQRTSSr},
- {X86::SQRTSDr, X86::VSQRTSDr}
- };
- bool HasAVX = Subtarget->hasAVX();
- unsigned Opc;
- const TargetRegisterClass *RC;
- switch (VT.SimpleTy) {
- default: return false;
- case MVT::f32: Opc = SqrtOpc[0][HasAVX]; RC = &X86::FR32RegClass; break;
- case MVT::f64: Opc = SqrtOpc[1][HasAVX]; RC = &X86::FR64RegClass; break;
- }
-
- const Value *SrcVal = II->getArgOperand(0);
- unsigned SrcReg = getRegForValue(SrcVal);
-
- if (SrcReg == 0)
- return false;
-
- unsigned ImplicitDefReg = 0;
- if (HasAVX) {
- ImplicitDefReg = createResultReg(RC);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);
- }
-
- unsigned ResultReg = createResultReg(RC);
- MachineInstrBuilder MIB;
- MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc),
- ResultReg);
-
- if (ImplicitDefReg)
- MIB.addReg(ImplicitDefReg);
-
- MIB.addReg(SrcReg);
-
- updateValueMap(II, ResultReg);
- return true;
- }
- case Intrinsic::sadd_with_overflow:
- case Intrinsic::uadd_with_overflow:
- case Intrinsic::ssub_with_overflow:
- case Intrinsic::usub_with_overflow:
- case Intrinsic::smul_with_overflow:
- case Intrinsic::umul_with_overflow: {
- // This implements the basic lowering of the xalu with overflow intrinsics
- // into add/sub/mul followed by either seto or setb.
- const Function *Callee = II->getCalledFunction();
- auto *Ty = cast<StructType>(Callee->getReturnType());
- Type *RetTy = Ty->getTypeAtIndex(0U);
- Type *CondTy = Ty->getTypeAtIndex(1);
-
- MVT VT;
- if (!isTypeLegal(RetTy, VT))
- return false;
-
- if (VT < MVT::i8 || VT > MVT::i64)
- return false;
-
- const Value *LHS = II->getArgOperand(0);
- const Value *RHS = II->getArgOperand(1);
-
- // Canonicalize immediate to the RHS.
- if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS) &&
- isCommutativeIntrinsic(II))
- std::swap(LHS, RHS);
-
- bool UseIncDec = false;
- if (isa<ConstantInt>(RHS) && cast<ConstantInt>(RHS)->isOne())
- UseIncDec = true;
-
- unsigned BaseOpc, CondOpc;
- switch (II->getIntrinsicID()) {
- default: llvm_unreachable("Unexpected intrinsic!");
- case Intrinsic::sadd_with_overflow:
- BaseOpc = UseIncDec ? unsigned(X86ISD::INC) : unsigned(ISD::ADD);
- CondOpc = X86::SETOr;
- break;
- case Intrinsic::uadd_with_overflow:
- BaseOpc = ISD::ADD; CondOpc = X86::SETBr; break;
- case Intrinsic::ssub_with_overflow:
- BaseOpc = UseIncDec ? unsigned(X86ISD::DEC) : unsigned(ISD::SUB);
- CondOpc = X86::SETOr;
- break;
- case Intrinsic::usub_with_overflow:
- BaseOpc = ISD::SUB; CondOpc = X86::SETBr; break;
- case Intrinsic::smul_with_overflow:
- BaseOpc = X86ISD::SMUL; CondOpc = X86::SETOr; break;
- case Intrinsic::umul_with_overflow:
- BaseOpc = X86ISD::UMUL; CondOpc = X86::SETOr; break;
- }
-
- unsigned LHSReg = getRegForValue(LHS);
- if (LHSReg == 0)
- return false;
- bool LHSIsKill = hasTrivialKill(LHS);
-
- unsigned ResultReg = 0;
- // Check if we have an immediate version.
- if (const auto *CI = dyn_cast<ConstantInt>(RHS)) {
- static const unsigned Opc[2][4] = {
- { X86::INC8r, X86::INC16r, X86::INC32r, X86::INC64r },
- { X86::DEC8r, X86::DEC16r, X86::DEC32r, X86::DEC64r }
- };
-
- if (BaseOpc == X86ISD::INC || BaseOpc == X86ISD::DEC) {
- ResultReg = createResultReg(TLI.getRegClassFor(VT));
- bool IsDec = BaseOpc == X86ISD::DEC;
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(Opc[IsDec][VT.SimpleTy-MVT::i8]), ResultReg)
- .addReg(LHSReg, getKillRegState(LHSIsKill));
- } else
- ResultReg = fastEmit_ri(VT, VT, BaseOpc, LHSReg, LHSIsKill,
- CI->getZExtValue());
- }
-
- unsigned RHSReg;
- bool RHSIsKill;
- if (!ResultReg) {
- RHSReg = getRegForValue(RHS);
- if (RHSReg == 0)
- return false;
- RHSIsKill = hasTrivialKill(RHS);
- ResultReg = fastEmit_rr(VT, VT, BaseOpc, LHSReg, LHSIsKill, RHSReg,
- RHSIsKill);
- }
-
- // FastISel doesn't have a pattern for all X86::MUL*r and X86::IMUL*r. Emit
- // it manually.
- if (BaseOpc == X86ISD::UMUL && !ResultReg) {
- static const unsigned MULOpc[] =
- { X86::MUL8r, X86::MUL16r, X86::MUL32r, X86::MUL64r };
- static const unsigned Reg[] = { X86::AL, X86::AX, X86::EAX, X86::RAX };
- // First copy the first operand into AL/AX/EAX/RAX, which is an implicit input to
- // the X86::MUL*r instruction.
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(TargetOpcode::COPY), Reg[VT.SimpleTy-MVT::i8])
- .addReg(LHSReg, getKillRegState(LHSIsKill));
- ResultReg = fastEmitInst_r(MULOpc[VT.SimpleTy-MVT::i8],
- TLI.getRegClassFor(VT), RHSReg, RHSIsKill);
- } else if (BaseOpc == X86ISD::SMUL && !ResultReg) {
- static const unsigned MULOpc[] =
- { X86::IMUL8r, X86::IMUL16rr, X86::IMUL32rr, X86::IMUL64rr };
- if (VT == MVT::i8) {
- // Copy the first operand into AL, which is an implicit input to the
- // X86::IMUL8r instruction.
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(TargetOpcode::COPY), X86::AL)
- .addReg(LHSReg, getKillRegState(LHSIsKill));
- ResultReg = fastEmitInst_r(MULOpc[0], TLI.getRegClassFor(VT), RHSReg,
- RHSIsKill);
- } else
- ResultReg = fastEmitInst_rr(MULOpc[VT.SimpleTy-MVT::i8],
- TLI.getRegClassFor(VT), LHSReg, LHSIsKill,
- RHSReg, RHSIsKill);
- }
-
- if (!ResultReg)
- return false;
-
- unsigned ResultReg2 = FuncInfo.CreateRegs(CondTy);
- assert((ResultReg+1) == ResultReg2 && "Nonconsecutive result registers.");
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CondOpc),
- ResultReg2);
-
- updateValueMap(II, ResultReg, 2);
- return true;
- }
- case Intrinsic::x86_sse_cvttss2si:
- case Intrinsic::x86_sse_cvttss2si64:
- case Intrinsic::x86_sse2_cvttsd2si:
- case Intrinsic::x86_sse2_cvttsd2si64: {
- bool IsInputDouble;
- switch (II->getIntrinsicID()) {
- default: llvm_unreachable("Unexpected intrinsic.");
- case Intrinsic::x86_sse_cvttss2si:
- case Intrinsic::x86_sse_cvttss2si64:
- if (!Subtarget->hasSSE1())
- return false;
- IsInputDouble = false;
- break;
- case Intrinsic::x86_sse2_cvttsd2si:
- case Intrinsic::x86_sse2_cvttsd2si64:
- if (!Subtarget->hasSSE2())
- return false;
- IsInputDouble = true;
- break;
- }
-
- Type *RetTy = II->getCalledFunction()->getReturnType();
- MVT VT;
- if (!isTypeLegal(RetTy, VT))
- return false;
-
- static const unsigned CvtOpc[2][2][2] = {
- { { X86::CVTTSS2SIrr, X86::VCVTTSS2SIrr },
- { X86::CVTTSS2SI64rr, X86::VCVTTSS2SI64rr } },
- { { X86::CVTTSD2SIrr, X86::VCVTTSD2SIrr },
- { X86::CVTTSD2SI64rr, X86::VCVTTSD2SI64rr } }
- };
- bool HasAVX = Subtarget->hasAVX();
- unsigned Opc;
- switch (VT.SimpleTy) {
- default: llvm_unreachable("Unexpected result type.");
- case MVT::i32: Opc = CvtOpc[IsInputDouble][0][HasAVX]; break;
- case MVT::i64: Opc = CvtOpc[IsInputDouble][1][HasAVX]; break;
- }
-
- // Check if we can fold insertelement instructions into the convert.
- const Value *Op = II->getArgOperand(0);
- while (auto *IE = dyn_cast<InsertElementInst>(Op)) {
- const Value *Index = IE->getOperand(2);
- if (!isa<ConstantInt>(Index))
- break;
- unsigned Idx = cast<ConstantInt>(Index)->getZExtValue();
-
- if (Idx == 0) {
- Op = IE->getOperand(1);
- break;
- }
- Op = IE->getOperand(0);
- }
-
- unsigned Reg = getRegForValue(Op);
- if (Reg == 0)
- return false;
-
- unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
- .addReg(Reg);
-
- updateValueMap(II, ResultReg);
- return true;
- }
- }
-}
-
-bool X86FastISel::fastLowerArguments() {
- if (!FuncInfo.CanLowerReturn)
- return false;
-
- const Function *F = FuncInfo.Fn;
- if (F->isVarArg())
- return false;
-
- CallingConv::ID CC = F->getCallingConv();
- if (CC != CallingConv::C)
- return false;
-
- if (Subtarget->isCallingConvWin64(CC))
- return false;
-
- if (!Subtarget->is64Bit())
- return false;
-
- // Only handle simple cases, i.e. up to 6 i32/i64 scalar arguments.
- unsigned GPRCnt = 0;
- unsigned FPRCnt = 0;
- unsigned Idx = 0;
- for (auto const &Arg : F->args()) {
- // The first argument is at index 1.
- ++Idx;
- if (F->getAttributes().hasAttribute(Idx, Attribute::ByVal) ||
- F->getAttributes().hasAttribute(Idx, Attribute::InReg) ||
- F->getAttributes().hasAttribute(Idx, Attribute::StructRet) ||
- F->getAttributes().hasAttribute(Idx, Attribute::Nest))
- return false;
-
- Type *ArgTy = Arg.getType();
- if (ArgTy->isStructTy() || ArgTy->isArrayTy() || ArgTy->isVectorTy())
- return false;
-
- EVT ArgVT = TLI.getValueType(ArgTy);
- if (!ArgVT.isSimple()) return false;
- switch (ArgVT.getSimpleVT().SimpleTy) {
- default: return false;
- case MVT::i32:
- case MVT::i64:
- ++GPRCnt;
- break;
- case MVT::f32:
- case MVT::f64:
- if (!Subtarget->hasSSE1())
- return false;
- ++FPRCnt;
- break;
- }
-
- if (GPRCnt > 6)
- return false;
-
- if (FPRCnt > 8)
- return false;
- }
-
- static const MCPhysReg GPR32ArgRegs[] = {
- X86::EDI, X86::ESI, X86::EDX, X86::ECX, X86::R8D, X86::R9D
- };
- static const MCPhysReg GPR64ArgRegs[] = {
- X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8 , X86::R9
- };
- static const MCPhysReg XMMArgRegs[] = {
- X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
- X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
- };
-
- unsigned GPRIdx = 0;
- unsigned FPRIdx = 0;
- for (auto const &Arg : F->args()) {
- MVT VT = TLI.getSimpleValueType(Arg.getType());
- const TargetRegisterClass *RC = TLI.getRegClassFor(VT);
- unsigned SrcReg;
- switch (VT.SimpleTy) {
- default: llvm_unreachable("Unexpected value type.");
- case MVT::i32: SrcReg = GPR32ArgRegs[GPRIdx++]; break;
- case MVT::i64: SrcReg = GPR64ArgRegs[GPRIdx++]; break;
- case MVT::f32: // fall-through
- case MVT::f64: SrcReg = XMMArgRegs[FPRIdx++]; break;
- }
- unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC);
- // FIXME: Unfortunately it's necessary to emit a copy from the livein copy.
- // Without this, EmitLiveInCopies may eliminate the livein if its only
- // use is a bitcast (which isn't turned into an instruction).
- unsigned ResultReg = createResultReg(RC);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(TargetOpcode::COPY), ResultReg)
- .addReg(DstReg, getKillRegState(true));
- updateValueMap(&Arg, ResultReg);
- }
- return true;
-}
-
-static unsigned computeBytesPoppedByCallee(const X86Subtarget *Subtarget,
- CallingConv::ID CC,
- ImmutableCallSite *CS) {
- if (Subtarget->is64Bit())
- return 0;
- if (Subtarget->getTargetTriple().isOSMSVCRT())
- return 0;
- if (CC == CallingConv::Fast || CC == CallingConv::GHC ||
- CC == CallingConv::HiPE)
- return 0;
- if (CS && !CS->paramHasAttr(1, Attribute::StructRet))
- return 0;
- if (CS && CS->paramHasAttr(1, Attribute::InReg))
- return 0;
- return 4;
-}
-
-bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
- auto &OutVals = CLI.OutVals;
- auto &OutFlags = CLI.OutFlags;
- auto &OutRegs = CLI.OutRegs;
- auto &Ins = CLI.Ins;
- auto &InRegs = CLI.InRegs;
- CallingConv::ID CC = CLI.CallConv;
- bool &IsTailCall = CLI.IsTailCall;
- bool IsVarArg = CLI.IsVarArg;
- const Value *Callee = CLI.Callee;
- const char *SymName = CLI.SymName;
-
- bool Is64Bit = Subtarget->is64Bit();
- bool IsWin64 = Subtarget->isCallingConvWin64(CC);
-
- // Handle only C, fastcc, and webkit_js calling conventions for now.
- switch (CC) {
- default: return false;
- case CallingConv::C:
- case CallingConv::Fast:
- case CallingConv::WebKit_JS:
- case CallingConv::X86_FastCall:
- case CallingConv::X86_64_Win64:
- case CallingConv::X86_64_SysV:
- break;
- }
-
- // Allow SelectionDAG isel to handle tail calls.
- if (IsTailCall)
- return false;
-
- // fastcc with -tailcallopt is intended to provide a guaranteed
- // tail call optimization. Fastisel doesn't know how to do that.
- if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt)
- return false;
-
- // Don't know how to handle Win64 varargs yet. Nothing special needed for
- // x86-32. Special handling for x86-64 is implemented.
- if (IsVarArg && IsWin64)
- return false;
-
- // Don't know about inalloca yet.
- if (CLI.CS && CLI.CS->hasInAllocaArgument())
- return false;
-
- // Fast-isel doesn't know about callee-pop yet.
- if (X86::isCalleePop(CC, Subtarget->is64Bit(), IsVarArg,
- TM.Options.GuaranteedTailCallOpt))
- return false;
-
- SmallVector<MVT, 16> OutVTs;
- SmallVector<unsigned, 16> ArgRegs;
-
- // If this is a constant i1/i8/i16 argument, promote to i32 to avoid an extra
- // instruction. This is safe because it is common to all FastISel supported
- // calling conventions on x86.
- for (int i = 0, e = OutVals.size(); i != e; ++i) {
- Value *&Val = OutVals[i];
- ISD::ArgFlagsTy Flags = OutFlags[i];
- if (auto *CI = dyn_cast<ConstantInt>(Val)) {
- if (CI->getBitWidth() < 32) {
- if (Flags.isSExt())
- Val = ConstantExpr::getSExt(CI, Type::getInt32Ty(CI->getContext()));
- else
- Val = ConstantExpr::getZExt(CI, Type::getInt32Ty(CI->getContext()));
- }
- }
-
- // Passing bools around ends up doing a trunc to i1 and passing it.
- // Codegen this as an argument + "and 1".
- MVT VT;
- auto *TI = dyn_cast<TruncInst>(Val);
- unsigned ResultReg;
- if (TI && TI->getType()->isIntegerTy(1) && CLI.CS &&
- (TI->getParent() == CLI.CS->getInstruction()->getParent()) &&
- TI->hasOneUse()) {
- Value *PrevVal = TI->getOperand(0);
- ResultReg = getRegForValue(PrevVal);
-
- if (!ResultReg)
- return false;
-
- if (!isTypeLegal(PrevVal->getType(), VT))
- return false;
-
- ResultReg =
- fastEmit_ri(VT, VT, ISD::AND, ResultReg, hasTrivialKill(PrevVal), 1);
- } else {
- if (!isTypeLegal(Val->getType(), VT))
- return false;
- ResultReg = getRegForValue(Val);
- }
-
- if (!ResultReg)
- return false;
-
- ArgRegs.push_back(ResultReg);
- OutVTs.push_back(VT);
- }
-
- // Analyze operands of the call, assigning locations to each operand.
- SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, ArgLocs, CLI.RetTy->getContext());
-
- // Allocate shadow area for Win64
- if (IsWin64)
- CCInfo.AllocateStack(32, 8);
-
- CCInfo.AnalyzeCallOperands(OutVTs, OutFlags, CC_X86);
-
- // Get a count of how many bytes are to be pushed on the stack.
- unsigned NumBytes = CCInfo.getNextStackOffset();
-
- // Issue CALLSEQ_START
- unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown))
- .addImm(NumBytes).addImm(0);
-
- // Walk the register/memloc assignments, inserting copies/loads.
- const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
- TM.getSubtargetImpl()->getRegisterInfo());
- for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
- CCValAssign const &VA = ArgLocs[i];
- const Value *ArgVal = OutVals[VA.getValNo()];
- MVT ArgVT = OutVTs[VA.getValNo()];
-
- if (ArgVT == MVT::x86mmx)
- return false;
-
- unsigned ArgReg = ArgRegs[VA.getValNo()];
-
- // Promote the value if needed.
- switch (VA.getLocInfo()) {
- case CCValAssign::Full: break;
- case CCValAssign::SExt: {
- assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
- "Unexpected extend");
- bool Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), ArgReg,
- ArgVT, ArgReg);
- assert(Emitted && "Failed to emit a sext!"); (void)Emitted;
- ArgVT = VA.getLocVT();
- break;
- }
- case CCValAssign::ZExt: {
- assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
- "Unexpected extend");
- bool Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), ArgReg,
- ArgVT, ArgReg);
- assert(Emitted && "Failed to emit a zext!"); (void)Emitted;
- ArgVT = VA.getLocVT();
- break;
- }
- case CCValAssign::AExt: {
- assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
- "Unexpected extend");
- bool Emitted = X86FastEmitExtend(ISD::ANY_EXTEND, VA.getLocVT(), ArgReg,
- ArgVT, ArgReg);
- if (!Emitted)
- Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), ArgReg,
- ArgVT, ArgReg);
- if (!Emitted)
- Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), ArgReg,
- ArgVT, ArgReg);
-
- assert(Emitted && "Failed to emit an aext!"); (void)Emitted;
- ArgVT = VA.getLocVT();
- break;
- }
- case CCValAssign::BCvt: {
- ArgReg = fastEmit_r(ArgVT, VA.getLocVT(), ISD::BITCAST, ArgReg,
- /*TODO: Kill=*/false);
- assert(ArgReg && "Failed to emit a bitcast!");
- ArgVT = VA.getLocVT();
- break;
- }
- case CCValAssign::VExt:
- // VExt has not been implemented, so this should be impossible to reach
- // for now. However, fall back to SelectionDAG isel once implemented.
- return false;
- case CCValAssign::AExtUpper:
- case CCValAssign::SExtUpper:
- case CCValAssign::ZExtUpper:
- case CCValAssign::FPExt:
- llvm_unreachable("Unexpected loc info!");
- case CCValAssign::Indirect:
- // FIXME: Indirect doesn't need extending, but fast-isel doesn't fully
- // support this.
- return false;
- }
-
- if (VA.isRegLoc()) {
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(ArgReg);
- OutRegs.push_back(VA.getLocReg());
- } else {
- assert(VA.isMemLoc());
-
- // Don't emit stores for undef values.
- if (isa<UndefValue>(ArgVal))
- continue;
-
- unsigned LocMemOffset = VA.getLocMemOffset();
- X86AddressMode AM;
- AM.Base.Reg = RegInfo->getStackRegister();
- AM.Disp = LocMemOffset;
- ISD::ArgFlagsTy Flags = OutFlags[VA.getValNo()];
- unsigned Alignment = DL.getABITypeAlignment(ArgVal->getType());
- MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
- MachinePointerInfo::getStack(LocMemOffset), MachineMemOperand::MOStore,
- ArgVT.getStoreSize(), Alignment);
- if (Flags.isByVal()) {
- X86AddressMode SrcAM;
- SrcAM.Base.Reg = ArgReg;
- if (!TryEmitSmallMemcpy(AM, SrcAM, Flags.getByValSize()))
- return false;
- } else if (isa<ConstantInt>(ArgVal) || isa<ConstantPointerNull>(ArgVal)) {
- // If this is a really simple value, emit this with the Value* version
- // of X86FastEmitStore. If it isn't simple, we don't want to do this,
- // as it can cause us to reevaluate the argument.
- if (!X86FastEmitStore(ArgVT, ArgVal, AM, MMO))
- return false;
- } else {
- bool ValIsKill = hasTrivialKill(ArgVal);
- if (!X86FastEmitStore(ArgVT, ArgReg, ValIsKill, AM, MMO))
- return false;
- }
- }
- }
-
- // ELF / PIC requires the GOT pointer to be in the EBX register before
- // function calls via the PLT.
- if (Subtarget->isPICStyleGOT()) {
- unsigned Base = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(TargetOpcode::COPY), X86::EBX).addReg(Base);
- }
-
- if (Is64Bit && IsVarArg && !IsWin64) {
- // From AMD64 ABI document:
- // For calls that may call functions that use varargs or stdargs
- // (prototype-less calls or calls to functions containing ellipsis (...) in
- // the declaration) %al is used as hidden argument to specify the number
- // of SSE registers used. The contents of %al do not need to match exactly
- // the number of registers, but must be an upper bound on the number of SSE
- // registers used and is in the range 0 - 8 inclusive.
-
- // Count the number of XMM registers allocated.
- static const MCPhysReg XMMArgRegs[] = {
- X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
- X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
- };
- unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
- assert((Subtarget->hasSSE1() || !NumXMMRegs)
- && "SSE registers cannot be used when SSE is disabled");
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV8ri),
- X86::AL).addImm(NumXMMRegs);
- }
-
- // Materialize callee address in a register. FIXME: GV address can be
- // handled with a CALLpcrel32 instead.
- X86AddressMode CalleeAM;
- if (!X86SelectCallAddress(Callee, CalleeAM))
- return false;
-
- unsigned CalleeOp = 0;
- const GlobalValue *GV = nullptr;
- if (CalleeAM.GV != nullptr) {
- GV = CalleeAM.GV;
- } else if (CalleeAM.Base.Reg != 0) {
- CalleeOp = CalleeAM.Base.Reg;
- } else
- return false;
-
- // Issue the call.
- MachineInstrBuilder MIB;
- if (CalleeOp) {
- // Register-indirect call.
- unsigned CallOpc = Is64Bit ? X86::CALL64r : X86::CALL32r;
- MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CallOpc))
- .addReg(CalleeOp);
- } else {
- // Direct call.
- assert(GV && "Not a direct call");
- unsigned CallOpc = Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32;
-
- // See if we need any target-specific flags on the GV operand.
- unsigned char OpFlags = 0;
-
- // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
- // external symbols must go through the PLT in PIC mode. If the symbol
- // has hidden or protected visibility, or if it is static or local, then
- // we don't need to use the PLT - we can directly call it.
- if (Subtarget->isTargetELF() &&
- TM.getRelocationModel() == Reloc::PIC_ &&
- GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
- OpFlags = X86II::MO_PLT;
- } else if (Subtarget->isPICStyleStubAny() &&
- (GV->isDeclaration() || GV->isWeakForLinker()) &&
- (!Subtarget->getTargetTriple().isMacOSX() ||
- Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
- // PC-relative references to external symbols should go through $stub,
- // unless we're building with the leopard linker or later, which
- // automatically synthesizes these stubs.
- OpFlags = X86II::MO_DARWIN_STUB;
- }
-
- MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CallOpc));
- if (SymName)
- MIB.addExternalSymbol(SymName, OpFlags);
- else
- MIB.addGlobalAddress(GV, 0, OpFlags);
- }
-
- // Add a register mask operand representing the call-preserved registers.
- // Proper defs for return values will be added by setPhysRegsDeadExcept().
- MIB.addRegMask(TRI.getCallPreservedMask(CC));
-
- // Add an implicit use GOT pointer in EBX.
- if (Subtarget->isPICStyleGOT())
- MIB.addReg(X86::EBX, RegState::Implicit);
-
- if (Is64Bit && IsVarArg && !IsWin64)
- MIB.addReg(X86::AL, RegState::Implicit);
-
- // Add implicit physical register uses to the call.
- for (auto Reg : OutRegs)
- MIB.addReg(Reg, RegState::Implicit);
-
- // Issue CALLSEQ_END
- unsigned NumBytesForCalleeToPop =
- computeBytesPoppedByCallee(Subtarget, CC, CLI.CS);
- unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackUp))
- .addImm(NumBytes).addImm(NumBytesForCalleeToPop);
-
- // Now handle call return values.
- SmallVector<CCValAssign, 16> RVLocs;
- CCState CCRetInfo(CC, IsVarArg, *FuncInfo.MF, RVLocs,
- CLI.RetTy->getContext());
- CCRetInfo.AnalyzeCallResult(Ins, RetCC_X86);
-
- // Copy all of the result registers out of their specified physreg.
- unsigned ResultReg = FuncInfo.CreateRegs(CLI.RetTy);
- for (unsigned i = 0; i != RVLocs.size(); ++i) {
- CCValAssign &VA = RVLocs[i];
- EVT CopyVT = VA.getValVT();
- unsigned CopyReg = ResultReg + i;
-
- // If this is x86-64, and we disabled SSE, we can't return FP values
- if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
- ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
- report_fatal_error("SSE register return with SSE disabled");
- }
-
- // If we prefer to use the value in xmm registers, copy it out as f80 and
- // use a truncate to move it from fp stack reg to xmm reg.
- if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
- isScalarFPTypeInSSEReg(VA.getValVT())) {
- CopyVT = MVT::f80;
- CopyReg = createResultReg(&X86::RFP80RegClass);
- }
-
- // Copy out the result.
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(TargetOpcode::COPY), CopyReg).addReg(VA.getLocReg());
- InRegs.push_back(VA.getLocReg());
-
- // Round the f80 to the right size, which also moves it to the appropriate
- // xmm register. This is accomplished by storing the f80 value in memory
- // and then loading it back.
- if (CopyVT != VA.getValVT()) {
- EVT ResVT = VA.getValVT();
- unsigned Opc = ResVT == MVT::f32 ? X86::ST_Fp80m32 : X86::ST_Fp80m64;
- unsigned MemSize = ResVT.getSizeInBits()/8;
- int FI = MFI.CreateStackObject(MemSize, MemSize, false);
- addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(Opc)), FI)
- .addReg(CopyReg);
- Opc = ResVT == MVT::f32 ? X86::MOVSSrm : X86::MOVSDrm;
- addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(Opc), ResultReg + i), FI);
- }
- }
-
- CLI.ResultReg = ResultReg;
- CLI.NumResultRegs = RVLocs.size();
- CLI.Call = MIB;
-
- return true;
-}
-
-bool
-X86FastISel::fastSelectInstruction(const Instruction *I) {
- switch (I->getOpcode()) {
- default: break;
- case Instruction::Load:
- return X86SelectLoad(I);
- case Instruction::Store:
- return X86SelectStore(I);
- case Instruction::Ret:
- return X86SelectRet(I);
- case Instruction::ICmp:
- case Instruction::FCmp:
- return X86SelectCmp(I);
- case Instruction::ZExt:
- return X86SelectZExt(I);
- case Instruction::Br:
- return X86SelectBranch(I);
- case Instruction::LShr:
- case Instruction::AShr:
- case Instruction::Shl:
- return X86SelectShift(I);
- case Instruction::SDiv:
- case Instruction::UDiv:
- case Instruction::SRem:
- case Instruction::URem:
- return X86SelectDivRem(I);
- case Instruction::Select:
- return X86SelectSelect(I);
- case Instruction::Trunc:
- return X86SelectTrunc(I);
- case Instruction::FPExt:
- return X86SelectFPExt(I);
- case Instruction::FPTrunc:
- return X86SelectFPTrunc(I);
- case Instruction::IntToPtr: // Deliberate fall-through.
- case Instruction::PtrToInt: {
- EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType());
- EVT DstVT = TLI.getValueType(I->getType());
- if (DstVT.bitsGT(SrcVT))
- return X86SelectZExt(I);
- if (DstVT.bitsLT(SrcVT))
- return X86SelectTrunc(I);
- unsigned Reg = getRegForValue(I->getOperand(0));
- if (Reg == 0) return false;
- updateValueMap(I, Reg);
- return true;
- }
- }
-
- return false;
-}
-
-unsigned X86FastISel::X86MaterializeInt(const ConstantInt *CI, MVT VT) {
- if (VT > MVT::i64)
- return 0;
-
- uint64_t Imm = CI->getZExtValue();
- if (Imm == 0) {
- unsigned SrcReg = fastEmitInst_(X86::MOV32r0, &X86::GR32RegClass);
- switch (VT.SimpleTy) {
- default: llvm_unreachable("Unexpected value type");
- case MVT::i1:
- case MVT::i8:
- return fastEmitInst_extractsubreg(MVT::i8, SrcReg, /*Kill=*/true,
- X86::sub_8bit);
- case MVT::i16:
- return fastEmitInst_extractsubreg(MVT::i16, SrcReg, /*Kill=*/true,
- X86::sub_16bit);
- case MVT::i32:
- return SrcReg;
- case MVT::i64: {
- unsigned ResultReg = createResultReg(&X86::GR64RegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(TargetOpcode::SUBREG_TO_REG), ResultReg)
- .addImm(0).addReg(SrcReg).addImm(X86::sub_32bit);
- return ResultReg;
- }
- }
- }
-
- unsigned Opc = 0;
- switch (VT.SimpleTy) {
- default: llvm_unreachable("Unexpected value type");
- case MVT::i1: VT = MVT::i8; // fall-through
- case MVT::i8: Opc = X86::MOV8ri; break;
- case MVT::i16: Opc = X86::MOV16ri; break;
- case MVT::i32: Opc = X86::MOV32ri; break;
- case MVT::i64: {
- if (isUInt<32>(Imm))
- Opc = X86::MOV32ri;
- else if (isInt<32>(Imm))
- Opc = X86::MOV64ri32;
- else
- Opc = X86::MOV64ri;
- break;
- }
- }
- if (VT == MVT::i64 && Opc == X86::MOV32ri) {
- unsigned SrcReg = fastEmitInst_i(Opc, &X86::GR32RegClass, Imm);
- unsigned ResultReg = createResultReg(&X86::GR64RegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(TargetOpcode::SUBREG_TO_REG), ResultReg)
- .addImm(0).addReg(SrcReg).addImm(X86::sub_32bit);
- return ResultReg;
- }
- return fastEmitInst_i(Opc, TLI.getRegClassFor(VT), Imm);
-}
-
-unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) {
- if (CFP->isNullValue())
- return fastMaterializeFloatZero(CFP);
-
- // Only the small and large code models are handled for now.
- CodeModel::Model CM = TM.getCodeModel();
- if (CM != CodeModel::Small && CM != CodeModel::Large)
- return 0;
-
- // Get opcode and regclass of the output for the given load instruction.
- unsigned Opc = 0;
- const TargetRegisterClass *RC = nullptr;
- switch (VT.SimpleTy) {
- default: return 0;
- case MVT::f32:
- if (X86ScalarSSEf32) {
- Opc = Subtarget->hasAVX() ? X86::VMOVSSrm : X86::MOVSSrm;
- RC = &X86::FR32RegClass;
- } else {
- Opc = X86::LD_Fp32m;
- RC = &X86::RFP32RegClass;
- }
- break;
- case MVT::f64:
- if (X86ScalarSSEf64) {
- Opc = Subtarget->hasAVX() ? X86::VMOVSDrm : X86::MOVSDrm;
- RC = &X86::FR64RegClass;
- } else {
- Opc = X86::LD_Fp64m;
- RC = &X86::RFP64RegClass;
- }
- break;
- case MVT::f80:
- // No f80 support yet.
- return 0;
- }
-
- // MachineConstantPool wants an explicit alignment.
- unsigned Align = DL.getPrefTypeAlignment(CFP->getType());
- if (Align == 0) {
- // Alignment of vector types. FIXME!
- Align = DL.getTypeAllocSize(CFP->getType());
- }
-
- // x86-32 PIC requires a PIC base register for constant pools.
- unsigned PICBase = 0;
- unsigned char OpFlag = 0;
- if (Subtarget->isPICStyleStubPIC()) { // Not dynamic-no-pic
- OpFlag = X86II::MO_PIC_BASE_OFFSET;
- PICBase = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
- } else if (Subtarget->isPICStyleGOT()) {
- OpFlag = X86II::MO_GOTOFF;
- PICBase = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
- } else if (Subtarget->isPICStyleRIPRel() &&
- TM.getCodeModel() == CodeModel::Small) {
- PICBase = X86::RIP;
- }
-
- // Create the load from the constant pool.
- unsigned CPI = MCP.getConstantPoolIndex(CFP, Align);
- unsigned ResultReg = createResultReg(RC);
-
- if (CM == CodeModel::Large) {
- unsigned AddrReg = createResultReg(&X86::GR64RegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV64ri),
- AddrReg)
- .addConstantPoolIndex(CPI, 0, OpFlag);
- MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(Opc), ResultReg);
- addDirectMem(MIB, AddrReg);
- MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
- MachinePointerInfo::getConstantPool(), MachineMemOperand::MOLoad,
- TM.getDataLayout()->getPointerSize(), Align);
- MIB->addMemOperand(*FuncInfo.MF, MMO);
- return ResultReg;
- }
-
- addConstantPoolReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(Opc), ResultReg),
- CPI, PICBase, OpFlag);
- return ResultReg;
-}
-
-unsigned X86FastISel::X86MaterializeGV(const GlobalValue *GV, MVT VT) {
- // Can't handle alternate code models yet.
- if (TM.getCodeModel() != CodeModel::Small)
- return 0;
-
- // Materialize addresses with LEA/MOV instructions.
- X86AddressMode AM;
- if (X86SelectAddress(GV, AM)) {
- // If the expression is just a basereg, then we're done, otherwise we need
- // to emit an LEA.
- if (AM.BaseType == X86AddressMode::RegBase &&
- AM.IndexReg == 0 && AM.Disp == 0 && AM.GV == nullptr)
- return AM.Base.Reg;
-
- unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
- if (TM.getRelocationModel() == Reloc::Static &&
- TLI.getPointerTy() == MVT::i64) {
- // The displacement could be more than 32 bits away, so we need to use
- // an instruction with a 64-bit immediate.
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV64ri),
- ResultReg)
- .addGlobalAddress(GV);
- } else {
- unsigned Opc = TLI.getPointerTy() == MVT::i32
- ? (Subtarget->isTarget64BitILP32()
- ? X86::LEA64_32r : X86::LEA32r)
- : X86::LEA64r;
- addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(Opc), ResultReg), AM);
- }
- return ResultReg;
- }
- return 0;
-}
-
-unsigned X86FastISel::fastMaterializeConstant(const Constant *C) {
- EVT CEVT = TLI.getValueType(C->getType(), true);
-
- // Only handle simple types.
- if (!CEVT.isSimple())
- return 0;
- MVT VT = CEVT.getSimpleVT();
-
- if (const auto *CI = dyn_cast<ConstantInt>(C))
- return X86MaterializeInt(CI, VT);
- else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C))
- return X86MaterializeFP(CFP, VT);
- else if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
- return X86MaterializeGV(GV, VT);
-
- return 0;
-}
-
-unsigned X86FastISel::fastMaterializeAlloca(const AllocaInst *C) {
- // Fail on dynamic allocas. At this point, getRegForValue has already
- // checked its CSE maps, so if we're here trying to handle a dynamic
- // alloca, we're not going to succeed. X86SelectAddress has a
- // check for dynamic allocas, because it's called directly from
- // various places, but fastMaterializeAlloca also needs a check
- // in order to avoid recursion between getRegForValue,
- // X86SelectAddress, and fastMaterializeAlloca.
- if (!FuncInfo.StaticAllocaMap.count(C))
- return 0;
- assert(C->isStaticAlloca() && "dynamic alloca in the static alloca map?");
-
- X86AddressMode AM;
- if (!X86SelectAddress(C, AM))
- return 0;
- unsigned Opc = TLI.getPointerTy() == MVT::i32
- ? (Subtarget->isTarget64BitILP32()
- ? X86::LEA64_32r : X86::LEA32r)
- : X86::LEA64r;
- const TargetRegisterClass* RC = TLI.getRegClassFor(TLI.getPointerTy());
- unsigned ResultReg = createResultReg(RC);
- addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(Opc), ResultReg), AM);
- return ResultReg;
-}
-
-unsigned X86FastISel::fastMaterializeFloatZero(const ConstantFP *CF) {
- MVT VT;
- if (!isTypeLegal(CF->getType(), VT))
- return 0;
-
- // Get opcode and regclass for the given zero.
- unsigned Opc = 0;
- const TargetRegisterClass *RC = nullptr;
- switch (VT.SimpleTy) {
- default: return 0;
- case MVT::f32:
- if (X86ScalarSSEf32) {
- Opc = X86::FsFLD0SS;
- RC = &X86::FR32RegClass;
- } else {
- Opc = X86::LD_Fp032;
- RC = &X86::RFP32RegClass;
- }
- break;
- case MVT::f64:
- if (X86ScalarSSEf64) {
- Opc = X86::FsFLD0SD;
- RC = &X86::FR64RegClass;
- } else {
- Opc = X86::LD_Fp064;
- RC = &X86::RFP64RegClass;
- }
- break;
- case MVT::f80:
- // No f80 support yet.
- return 0;
- }
-
- unsigned ResultReg = createResultReg(RC);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg);
- return ResultReg;
-}
-
-
-bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
- const LoadInst *LI) {
- const Value *Ptr = LI->getPointerOperand();
- X86AddressMode AM;
- if (!X86SelectAddress(Ptr, AM))
- return false;
-
- const X86InstrInfo &XII = (const X86InstrInfo &)TII;
-
- unsigned Size = DL.getTypeAllocSize(LI->getType());
- unsigned Alignment = LI->getAlignment();
-
- if (Alignment == 0) // Ensure that codegen never sees alignment 0
- Alignment = DL.getABITypeAlignment(LI->getType());
-
- SmallVector<MachineOperand, 8> AddrOps;
- AM.getFullAddress(AddrOps);
-
- MachineInstr *Result =
- XII.foldMemoryOperandImpl(*FuncInfo.MF, MI, OpNo, AddrOps,
- Size, Alignment, /*AllowCommute=*/true);
- if (!Result)
- return false;
-
- Result->addMemOperand(*FuncInfo.MF, createMachineMemOperandFor(LI));
- FuncInfo.MBB->insert(FuncInfo.InsertPt, Result);
- MI->eraseFromParent();
- return true;
-}
-
-
-namespace llvm {
- FastISel *X86::createFastISel(FunctionLoweringInfo &funcInfo,
- const TargetLibraryInfo *libInfo) {
- return new X86FastISel(funcInfo, libInfo);
- }
-}
+//===-- X86FastISel.cpp - X86 FastISel implementation ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the X86-specific support for the FastISel class. Much +// of the target-specific code is generated by tablegen in the file +// X86GenFastISel.inc, which is #included here. +// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "X86CallingConv.h" +#include "X86InstrBuilder.h" +#include "X86InstrInfo.h" +#include "X86MachineFunctionInfo.h" +#include "X86RegisterInfo.h" +#include "X86Subtarget.h" +#include "X86TargetMachine.h" +#include "llvm/Analysis/BranchProbabilityInfo.h" +#include "llvm/CodeGen/Analysis.h" +#include "llvm/CodeGen/FastISel.h" +#include "llvm/CodeGen/FunctionLoweringInfo.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/GetElementPtrTypeIterator.h" +#include "llvm/IR/GlobalAlias.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Operator.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Target/TargetOptions.h" +using namespace llvm; + +namespace { + +class X86FastISel final : public FastISel { + /// Subtarget - Keep a pointer to the X86Subtarget around so that we can + /// make the right decision when generating code for different targets. + const X86Subtarget *Subtarget; + + /// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87 + /// floating point ops. + /// When SSE is available, use it for f32 operations. + /// When SSE2 is available, use it for f64 operations. + bool X86ScalarSSEf64; + bool X86ScalarSSEf32; + +public: + explicit X86FastISel(FunctionLoweringInfo &funcInfo, + const TargetLibraryInfo *libInfo) + : FastISel(funcInfo, libInfo) { + Subtarget = &TM.getSubtarget<X86Subtarget>(); + X86ScalarSSEf64 = Subtarget->hasSSE2(); + X86ScalarSSEf32 = Subtarget->hasSSE1(); + } + + bool fastSelectInstruction(const Instruction *I) override; + + /// \brief The specified machine instr operand is a vreg, and that + /// vreg is being provided by the specified load instruction. If possible, + /// try to fold the load as an operand to the instruction, returning true if + /// possible. 
+ bool tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, + const LoadInst *LI) override; + + bool fastLowerArguments() override; + bool fastLowerCall(CallLoweringInfo &CLI) override; + bool fastLowerIntrinsicCall(const IntrinsicInst *II) override; + +#include "X86GenFastISel.inc" + +private: + bool X86FastEmitCompare(const Value *LHS, const Value *RHS, EVT VT, DebugLoc DL); + + bool X86FastEmitLoad(EVT VT, const X86AddressMode &AM, MachineMemOperand *MMO, + unsigned &ResultReg); + + bool X86FastEmitStore(EVT VT, const Value *Val, const X86AddressMode &AM, + MachineMemOperand *MMO = nullptr, bool Aligned = false); + bool X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill, + const X86AddressMode &AM, + MachineMemOperand *MMO = nullptr, bool Aligned = false); + + bool X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, EVT SrcVT, + unsigned &ResultReg); + + bool X86SelectAddress(const Value *V, X86AddressMode &AM); + bool X86SelectCallAddress(const Value *V, X86AddressMode &AM); + + bool X86SelectLoad(const Instruction *I); + + bool X86SelectStore(const Instruction *I); + + bool X86SelectRet(const Instruction *I); + + bool X86SelectCmp(const Instruction *I); + + bool X86SelectZExt(const Instruction *I); + + bool X86SelectBranch(const Instruction *I); + + bool X86SelectShift(const Instruction *I); + + bool X86SelectDivRem(const Instruction *I); + + bool X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I); + + bool X86FastEmitSSESelect(MVT RetVT, const Instruction *I); + + bool X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I); + + bool X86SelectSelect(const Instruction *I); + + bool X86SelectTrunc(const Instruction *I); + + bool X86SelectFPExt(const Instruction *I); + bool X86SelectFPTrunc(const Instruction *I); + + const X86InstrInfo *getInstrInfo() const { + return getTargetMachine()->getSubtargetImpl()->getInstrInfo(); + } + const X86TargetMachine *getTargetMachine() const { + return static_cast<const X86TargetMachine *>(&TM); + } + + bool handleConstantAddresses(const Value *V, X86AddressMode &AM); + + unsigned X86MaterializeInt(const ConstantInt *CI, MVT VT); + unsigned X86MaterializeFP(const ConstantFP *CFP, MVT VT); + unsigned X86MaterializeGV(const GlobalValue *GV, MVT VT); + unsigned fastMaterializeConstant(const Constant *C) override; + + unsigned fastMaterializeAlloca(const AllocaInst *C) override; + + unsigned fastMaterializeFloatZero(const ConstantFP *CF) override; + + /// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is + /// computed in an SSE register, not on the X87 floating point stack. + bool isScalarFPTypeInSSEReg(EVT VT) const { + return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2 + (VT == MVT::f32 && X86ScalarSSEf32); // f32 is when SSE1 + } + + bool isTypeLegal(Type *Ty, MVT &VT, bool AllowI1 = false); + + bool IsMemcpySmall(uint64_t Len); + + bool TryEmitSmallMemcpy(X86AddressMode DestAM, + X86AddressMode SrcAM, uint64_t Len); + + bool foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I, + const Value *Cond); +}; + +} // end anonymous namespace. 
+ +static std::pair<X86::CondCode, bool> +getX86ConditionCode(CmpInst::Predicate Predicate) { + X86::CondCode CC = X86::COND_INVALID; + bool NeedSwap = false; + switch (Predicate) { + default: break; + // Floating-point Predicates + case CmpInst::FCMP_UEQ: CC = X86::COND_E; break; + case CmpInst::FCMP_OLT: NeedSwap = true; // fall-through + case CmpInst::FCMP_OGT: CC = X86::COND_A; break; + case CmpInst::FCMP_OLE: NeedSwap = true; // fall-through + case CmpInst::FCMP_OGE: CC = X86::COND_AE; break; + case CmpInst::FCMP_UGT: NeedSwap = true; // fall-through + case CmpInst::FCMP_ULT: CC = X86::COND_B; break; + case CmpInst::FCMP_UGE: NeedSwap = true; // fall-through + case CmpInst::FCMP_ULE: CC = X86::COND_BE; break; + case CmpInst::FCMP_ONE: CC = X86::COND_NE; break; + case CmpInst::FCMP_UNO: CC = X86::COND_P; break; + case CmpInst::FCMP_ORD: CC = X86::COND_NP; break; + case CmpInst::FCMP_OEQ: // fall-through + case CmpInst::FCMP_UNE: CC = X86::COND_INVALID; break; + + // Integer Predicates + case CmpInst::ICMP_EQ: CC = X86::COND_E; break; + case CmpInst::ICMP_NE: CC = X86::COND_NE; break; + case CmpInst::ICMP_UGT: CC = X86::COND_A; break; + case CmpInst::ICMP_UGE: CC = X86::COND_AE; break; + case CmpInst::ICMP_ULT: CC = X86::COND_B; break; + case CmpInst::ICMP_ULE: CC = X86::COND_BE; break; + case CmpInst::ICMP_SGT: CC = X86::COND_G; break; + case CmpInst::ICMP_SGE: CC = X86::COND_GE; break; + case CmpInst::ICMP_SLT: CC = X86::COND_L; break; + case CmpInst::ICMP_SLE: CC = X86::COND_LE; break; + } + + return std::make_pair(CC, NeedSwap); +} + +static std::pair<unsigned, bool> +getX86SSEConditionCode(CmpInst::Predicate Predicate) { + unsigned CC; + bool NeedSwap = false; + + // SSE Condition code mapping: + // 0 - EQ + // 1 - LT + // 2 - LE + // 3 - UNORD + // 4 - NEQ + // 5 - NLT + // 6 - NLE + // 7 - ORD + switch (Predicate) { + default: llvm_unreachable("Unexpected predicate"); + case CmpInst::FCMP_OEQ: CC = 0; break; + case CmpInst::FCMP_OGT: NeedSwap = true; // fall-through + case CmpInst::FCMP_OLT: CC = 1; break; + case CmpInst::FCMP_OGE: NeedSwap = true; // fall-through + case CmpInst::FCMP_OLE: CC = 2; break; + case CmpInst::FCMP_UNO: CC = 3; break; + case CmpInst::FCMP_UNE: CC = 4; break; + case CmpInst::FCMP_ULE: NeedSwap = true; // fall-through + case CmpInst::FCMP_UGE: CC = 5; break; + case CmpInst::FCMP_ULT: NeedSwap = true; // fall-through + case CmpInst::FCMP_UGT: CC = 6; break; + case CmpInst::FCMP_ORD: CC = 7; break; + case CmpInst::FCMP_UEQ: + case CmpInst::FCMP_ONE: CC = 8; break; + } + + return std::make_pair(CC, NeedSwap); +} + +/// \brief Check if it is possible to fold the condition from the XALU intrinsic +/// into the user. The condition code will only be updated on success. 
+bool X86FastISel::foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I, + const Value *Cond) { + if (!isa<ExtractValueInst>(Cond)) + return false; + + const auto *EV = cast<ExtractValueInst>(Cond); + if (!isa<IntrinsicInst>(EV->getAggregateOperand())) + return false; + + const auto *II = cast<IntrinsicInst>(EV->getAggregateOperand()); + MVT RetVT; + const Function *Callee = II->getCalledFunction(); + Type *RetTy = + cast<StructType>(Callee->getReturnType())->getTypeAtIndex(0U); + if (!isTypeLegal(RetTy, RetVT)) + return false; + + if (RetVT != MVT::i32 && RetVT != MVT::i64) + return false; + + X86::CondCode TmpCC; + switch (II->getIntrinsicID()) { + default: return false; + case Intrinsic::sadd_with_overflow: + case Intrinsic::ssub_with_overflow: + case Intrinsic::smul_with_overflow: + case Intrinsic::umul_with_overflow: TmpCC = X86::COND_O; break; + case Intrinsic::uadd_with_overflow: + case Intrinsic::usub_with_overflow: TmpCC = X86::COND_B; break; + } + + // Check if both instructions are in the same basic block. + if (II->getParent() != I->getParent()) + return false; + + // Make sure nothing is in the way + BasicBlock::const_iterator Start = I; + BasicBlock::const_iterator End = II; + for (auto Itr = std::prev(Start); Itr != End; --Itr) { + // We only expect extractvalue instructions between the intrinsic and the + // instruction to be selected. + if (!isa<ExtractValueInst>(Itr)) + return false; + + // Check that the extractvalue operand comes from the intrinsic. + const auto *EVI = cast<ExtractValueInst>(Itr); + if (EVI->getAggregateOperand() != II) + return false; + } + + CC = TmpCC; + return true; +} + +bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) { + EVT evt = TLI.getValueType(Ty, /*HandleUnknown=*/true); + if (evt == MVT::Other || !evt.isSimple()) + // Unhandled type. Halt "fast" selection and bail. + return false; + + VT = evt.getSimpleVT(); + // For now, require SSE/SSE2 for performing floating-point operations, + // since x87 requires additional work. + if (VT == MVT::f64 && !X86ScalarSSEf64) + return false; + if (VT == MVT::f32 && !X86ScalarSSEf32) + return false; + // Similarly, no f80 support yet. + if (VT == MVT::f80) + return false; + // We only handle legal types. For example, on x86-32 the instruction + // selector contains all of the 64-bit instructions from x86-64, + // under the assumption that i64 won't be used if the target doesn't + // support it. + return (AllowI1 && VT == MVT::i1) || TLI.isTypeLegal(VT); +} + +#include "X86GenCallingConv.inc" + +/// X86FastEmitLoad - Emit a machine instruction to load a value of type VT. +/// The address is either pre-computed, i.e. Ptr, or a GlobalAddress, i.e. GV. +/// Return true and the result register by reference if it is possible. +bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM, + MachineMemOperand *MMO, unsigned &ResultReg) { + // Get opcode and regclass of the output for the given load instruction. + unsigned Opc = 0; + const TargetRegisterClass *RC = nullptr; + switch (VT.getSimpleVT().SimpleTy) { + default: return false; + case MVT::i1: + case MVT::i8: + Opc = X86::MOV8rm; + RC = &X86::GR8RegClass; + break; + case MVT::i16: + Opc = X86::MOV16rm; + RC = &X86::GR16RegClass; + break; + case MVT::i32: + Opc = X86::MOV32rm; + RC = &X86::GR32RegClass; + break; + case MVT::i64: + // Must be in x86-64 mode. + Opc = X86::MOV64rm; + RC = &X86::GR64RegClass; + break; + case MVT::f32: + if (X86ScalarSSEf32) { + Opc = Subtarget->hasAVX() ? 
X86::VMOVSSrm : X86::MOVSSrm; + RC = &X86::FR32RegClass; + } else { + Opc = X86::LD_Fp32m; + RC = &X86::RFP32RegClass; + } + break; + case MVT::f64: + if (X86ScalarSSEf64) { + Opc = Subtarget->hasAVX() ? X86::VMOVSDrm : X86::MOVSDrm; + RC = &X86::FR64RegClass; + } else { + Opc = X86::LD_Fp64m; + RC = &X86::RFP64RegClass; + } + break; + case MVT::f80: + // No f80 support yet. + return false; + } + + ResultReg = createResultReg(RC); + MachineInstrBuilder MIB = + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg); + addFullAddress(MIB, AM); + if (MMO) + MIB->addMemOperand(*FuncInfo.MF, MMO); + return true; +} + +/// X86FastEmitStore - Emit a machine instruction to store a value Val of +/// type VT. The address is either pre-computed, consisted of a base ptr, Ptr +/// and a displacement offset, or a GlobalAddress, +/// i.e. V. Return true if it is possible. +bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill, + const X86AddressMode &AM, + MachineMemOperand *MMO, bool Aligned) { + // Get opcode and regclass of the output for the given store instruction. + unsigned Opc = 0; + switch (VT.getSimpleVT().SimpleTy) { + case MVT::f80: // No f80 support yet. + default: return false; + case MVT::i1: { + // Mask out all but lowest bit. + unsigned AndResult = createResultReg(&X86::GR8RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(X86::AND8ri), AndResult) + .addReg(ValReg, getKillRegState(ValIsKill)).addImm(1); + ValReg = AndResult; + } + // FALLTHROUGH, handling i1 as i8. + case MVT::i8: Opc = X86::MOV8mr; break; + case MVT::i16: Opc = X86::MOV16mr; break; + case MVT::i32: Opc = X86::MOV32mr; break; + case MVT::i64: Opc = X86::MOV64mr; break; // Must be in x86-64 mode. + case MVT::f32: + Opc = X86ScalarSSEf32 ? + (Subtarget->hasAVX() ? X86::VMOVSSmr : X86::MOVSSmr) : X86::ST_Fp32m; + break; + case MVT::f64: + Opc = X86ScalarSSEf64 ? + (Subtarget->hasAVX() ? X86::VMOVSDmr : X86::MOVSDmr) : X86::ST_Fp64m; + break; + case MVT::v4f32: + if (Aligned) + Opc = Subtarget->hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr; + else + Opc = Subtarget->hasAVX() ? X86::VMOVUPSmr : X86::MOVUPSmr; + break; + case MVT::v2f64: + if (Aligned) + Opc = Subtarget->hasAVX() ? X86::VMOVAPDmr : X86::MOVAPDmr; + else + Opc = Subtarget->hasAVX() ? X86::VMOVUPDmr : X86::MOVUPDmr; + break; + case MVT::v4i32: + case MVT::v2i64: + case MVT::v8i16: + case MVT::v16i8: + if (Aligned) + Opc = Subtarget->hasAVX() ? X86::VMOVDQAmr : X86::MOVDQAmr; + else + Opc = Subtarget->hasAVX() ? X86::VMOVDQUmr : X86::MOVDQUmr; + break; + } + + MachineInstrBuilder MIB = + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc)); + addFullAddress(MIB, AM).addReg(ValReg, getKillRegState(ValIsKill)); + if (MMO) + MIB->addMemOperand(*FuncInfo.MF, MMO); + + return true; +} + +bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val, + const X86AddressMode &AM, + MachineMemOperand *MMO, bool Aligned) { + // Handle 'null' like i32/i64 0. + if (isa<ConstantPointerNull>(Val)) + Val = Constant::getNullValue(DL.getIntPtrType(Val->getContext())); + + // If this is a store of a simple constant, fold the constant into the store. + if (const ConstantInt *CI = dyn_cast<ConstantInt>(Val)) { + unsigned Opc = 0; + bool Signed = true; + switch (VT.getSimpleVT().SimpleTy) { + default: break; + case MVT::i1: Signed = false; // FALLTHROUGH to handle as i8. 
+ case MVT::i8: Opc = X86::MOV8mi; break; + case MVT::i16: Opc = X86::MOV16mi; break; + case MVT::i32: Opc = X86::MOV32mi; break; + case MVT::i64: + // Must be a 32-bit sign extended value. + if (isInt<32>(CI->getSExtValue())) + Opc = X86::MOV64mi32; + break; + } + + if (Opc) { + MachineInstrBuilder MIB = + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc)); + addFullAddress(MIB, AM).addImm(Signed ? (uint64_t) CI->getSExtValue() + : CI->getZExtValue()); + if (MMO) + MIB->addMemOperand(*FuncInfo.MF, MMO); + return true; + } + } + + unsigned ValReg = getRegForValue(Val); + if (ValReg == 0) + return false; + + bool ValKill = hasTrivialKill(Val); + return X86FastEmitStore(VT, ValReg, ValKill, AM, MMO, Aligned); +} + +/// X86FastEmitExtend - Emit a machine instruction to extend a value Src of +/// type SrcVT to type DstVT using the specified extension opcode Opc (e.g. +/// ISD::SIGN_EXTEND). +bool X86FastISel::X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, + unsigned Src, EVT SrcVT, + unsigned &ResultReg) { + unsigned RR = fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Opc, + Src, /*TODO: Kill=*/false); + if (RR == 0) + return false; + + ResultReg = RR; + return true; +} + +bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) { + // Handle constant address. + if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) { + // Can't handle alternate code models yet. + if (TM.getCodeModel() != CodeModel::Small) + return false; + + // Can't handle TLS yet. + if (GV->isThreadLocal()) + return false; + + // RIP-relative addresses can't have additional register operands, so if + // we've already folded stuff into the addressing mode, just force the + // global value into its own register, which we can use as the basereg. + if (!Subtarget->isPICStyleRIPRel() || + (AM.Base.Reg == 0 && AM.IndexReg == 0)) { + // Okay, we've committed to selecting this global. Set up the address. + AM.GV = GV; + + // Allow the subtarget to classify the global. + unsigned char GVFlags = Subtarget->ClassifyGlobalReference(GV, TM); + + // If this reference is relative to the pic base, set it now. + if (isGlobalRelativeToPICBase(GVFlags)) { + // FIXME: How do we know Base.Reg is free?? + AM.Base.Reg = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF); + } + + // Unless the ABI requires an extra load, return a direct reference to + // the global. + if (!isGlobalStubReference(GVFlags)) { + if (Subtarget->isPICStyleRIPRel()) { + // Use rip-relative addressing if we can. Above we verified that the + // base and index registers are unused. + assert(AM.Base.Reg == 0 && AM.IndexReg == 0); + AM.Base.Reg = X86::RIP; + } + AM.GVOpFlags = GVFlags; + return true; + } + + // Ok, we need to do a load from a stub. If we've already loaded from + // this stub, reuse the loaded pointer, otherwise emit the load now. + DenseMap<const Value *, unsigned>::iterator I = LocalValueMap.find(V); + unsigned LoadReg; + if (I != LocalValueMap.end() && I->second != 0) { + LoadReg = I->second; + } else { + // Issue load from stub. + unsigned Opc = 0; + const TargetRegisterClass *RC = nullptr; + X86AddressMode StubAM; + StubAM.Base.Reg = AM.Base.Reg; + StubAM.GV = GV; + StubAM.GVOpFlags = GVFlags; + + // Prepare for inserting code in the local-value area. 
+ SavePoint SaveInsertPt = enterLocalValueArea(); + + if (TLI.getPointerTy() == MVT::i64) { + Opc = X86::MOV64rm; + RC = &X86::GR64RegClass; + + if (Subtarget->isPICStyleRIPRel()) + StubAM.Base.Reg = X86::RIP; + } else { + Opc = X86::MOV32rm; + RC = &X86::GR32RegClass; + } + + LoadReg = createResultReg(RC); + MachineInstrBuilder LoadMI = + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), LoadReg); + addFullAddress(LoadMI, StubAM); + + // Ok, back to normal mode. + leaveLocalValueArea(SaveInsertPt); + + // Prevent loading GV stub multiple times in same MBB. + LocalValueMap[V] = LoadReg; + } + + // Now construct the final address. Note that the Disp, Scale, + // and Index values may already be set here. + AM.Base.Reg = LoadReg; + AM.GV = nullptr; + return true; + } + } + + // If all else fails, try to materialize the value in a register. + if (!AM.GV || !Subtarget->isPICStyleRIPRel()) { + if (AM.Base.Reg == 0) { + AM.Base.Reg = getRegForValue(V); + return AM.Base.Reg != 0; + } + if (AM.IndexReg == 0) { + assert(AM.Scale == 1 && "Scale with no index!"); + AM.IndexReg = getRegForValue(V); + return AM.IndexReg != 0; + } + } + + return false; +} + +/// X86SelectAddress - Attempt to fill in an address from the given value. +/// +bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) { + SmallVector<const Value *, 32> GEPs; +redo_gep: + const User *U = nullptr; + unsigned Opcode = Instruction::UserOp1; + if (const Instruction *I = dyn_cast<Instruction>(V)) { + // Don't walk into other basic blocks; it's possible we haven't + // visited them yet, so the instructions may not yet be assigned + // virtual registers. + if (FuncInfo.StaticAllocaMap.count(static_cast<const AllocaInst *>(V)) || + FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) { + Opcode = I->getOpcode(); + U = I; + } + } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(V)) { + Opcode = C->getOpcode(); + U = C; + } + + if (PointerType *Ty = dyn_cast<PointerType>(V->getType())) + if (Ty->getAddressSpace() > 255) + // Fast instruction selection doesn't support the special + // address spaces. + return false; + + switch (Opcode) { + default: break; + case Instruction::BitCast: + // Look past bitcasts. + return X86SelectAddress(U->getOperand(0), AM); + + case Instruction::IntToPtr: + // Look past no-op inttoptrs. + if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy()) + return X86SelectAddress(U->getOperand(0), AM); + break; + + case Instruction::PtrToInt: + // Look past no-op ptrtoints. + if (TLI.getValueType(U->getType()) == TLI.getPointerTy()) + return X86SelectAddress(U->getOperand(0), AM); + break; + + case Instruction::Alloca: { + // Do static allocas. + const AllocaInst *A = cast<AllocaInst>(V); + DenseMap<const AllocaInst *, int>::iterator SI = + FuncInfo.StaticAllocaMap.find(A); + if (SI != FuncInfo.StaticAllocaMap.end()) { + AM.BaseType = X86AddressMode::FrameIndexBase; + AM.Base.FrameIndex = SI->second; + return true; + } + break; + } + + case Instruction::Add: { + // Adds of constants are common and easy enough. + if (const ConstantInt *CI = dyn_cast<ConstantInt>(U->getOperand(1))) { + uint64_t Disp = (int32_t)AM.Disp + (uint64_t)CI->getSExtValue(); + // They have to fit in the 32-bit signed displacement field though. + if (isInt<32>(Disp)) { + AM.Disp = (uint32_t)Disp; + return X86SelectAddress(U->getOperand(0), AM); + } + } + break; + } + + case Instruction::GetElementPtr: { + X86AddressMode SavedAM = AM; + + // Pattern-match simple GEPs. 
+ uint64_t Disp = (int32_t)AM.Disp; + unsigned IndexReg = AM.IndexReg; + unsigned Scale = AM.Scale; + gep_type_iterator GTI = gep_type_begin(U); + // Iterate through the indices, folding what we can. Constants can be + // folded, and one dynamic index can be handled, if the scale is supported. + for (User::const_op_iterator i = U->op_begin() + 1, e = U->op_end(); + i != e; ++i, ++GTI) { + const Value *Op = *i; + if (StructType *STy = dyn_cast<StructType>(*GTI)) { + const StructLayout *SL = DL.getStructLayout(STy); + Disp += SL->getElementOffset(cast<ConstantInt>(Op)->getZExtValue()); + continue; + } + + // A array/variable index is always of the form i*S where S is the + // constant scale size. See if we can push the scale into immediates. + uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType()); + for (;;) { + if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) { + // Constant-offset addressing. + Disp += CI->getSExtValue() * S; + break; + } + if (canFoldAddIntoGEP(U, Op)) { + // A compatible add with a constant operand. Fold the constant. + ConstantInt *CI = + cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1)); + Disp += CI->getSExtValue() * S; + // Iterate on the other operand. + Op = cast<AddOperator>(Op)->getOperand(0); + continue; + } + if (IndexReg == 0 && + (!AM.GV || !Subtarget->isPICStyleRIPRel()) && + (S == 1 || S == 2 || S == 4 || S == 8)) { + // Scaled-index addressing. + Scale = S; + IndexReg = getRegForGEPIndex(Op).first; + if (IndexReg == 0) + return false; + break; + } + // Unsupported. + goto unsupported_gep; + } + } + + // Check for displacement overflow. + if (!isInt<32>(Disp)) + break; + + AM.IndexReg = IndexReg; + AM.Scale = Scale; + AM.Disp = (uint32_t)Disp; + GEPs.push_back(V); + + if (const GetElementPtrInst *GEP = + dyn_cast<GetElementPtrInst>(U->getOperand(0))) { + // Ok, the GEP indices were covered by constant-offset and scaled-index + // addressing. Update the address state and move on to examining the base. + V = GEP; + goto redo_gep; + } else if (X86SelectAddress(U->getOperand(0), AM)) { + return true; + } + + // If we couldn't merge the gep value into this addr mode, revert back to + // our address and just match the value instead of completely failing. + AM = SavedAM; + + for (SmallVectorImpl<const Value *>::reverse_iterator + I = GEPs.rbegin(), E = GEPs.rend(); I != E; ++I) + if (handleConstantAddresses(*I, AM)) + return true; + + return false; + unsupported_gep: + // Ok, the GEP indices weren't all covered. + break; + } + } + + return handleConstantAddresses(V, AM); +} + +/// X86SelectCallAddress - Attempt to fill in an address from the given value. +/// +bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) { + const User *U = nullptr; + unsigned Opcode = Instruction::UserOp1; + const Instruction *I = dyn_cast<Instruction>(V); + // Record if the value is defined in the same basic block. + // + // This information is crucial to know whether or not folding an + // operand is valid. + // Indeed, FastISel generates or reuses a virtual register for all + // operands of all instructions it selects. Obviously, the definition and + // its uses must use the same virtual register otherwise the produced + // code is incorrect. + // Before instruction selection, FunctionLoweringInfo::set sets the virtual + // registers for values that are alive across basic blocks. 
This ensures + // that the values are consistently set between across basic block, even + // if different instruction selection mechanisms are used (e.g., a mix of + // SDISel and FastISel). + // For values local to a basic block, the instruction selection process + // generates these virtual registers with whatever method is appropriate + // for its needs. In particular, FastISel and SDISel do not share the way + // local virtual registers are set. + // Therefore, this is impossible (or at least unsafe) to share values + // between basic blocks unless they use the same instruction selection + // method, which is not guarantee for X86. + // Moreover, things like hasOneUse could not be used accurately, if we + // allow to reference values across basic blocks whereas they are not + // alive across basic blocks initially. + bool InMBB = true; + if (I) { + Opcode = I->getOpcode(); + U = I; + InMBB = I->getParent() == FuncInfo.MBB->getBasicBlock(); + } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(V)) { + Opcode = C->getOpcode(); + U = C; + } + + switch (Opcode) { + default: break; + case Instruction::BitCast: + // Look past bitcasts if its operand is in the same BB. + if (InMBB) + return X86SelectCallAddress(U->getOperand(0), AM); + break; + + case Instruction::IntToPtr: + // Look past no-op inttoptrs if its operand is in the same BB. + if (InMBB && + TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy()) + return X86SelectCallAddress(U->getOperand(0), AM); + break; + + case Instruction::PtrToInt: + // Look past no-op ptrtoints if its operand is in the same BB. + if (InMBB && + TLI.getValueType(U->getType()) == TLI.getPointerTy()) + return X86SelectCallAddress(U->getOperand(0), AM); + break; + } + + // Handle constant address. + if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) { + // Can't handle alternate code models yet. + if (TM.getCodeModel() != CodeModel::Small) + return false; + + // RIP-relative addresses can't have additional register operands. + if (Subtarget->isPICStyleRIPRel() && + (AM.Base.Reg != 0 || AM.IndexReg != 0)) + return false; + + // Can't handle DLL Import. + if (GV->hasDLLImportStorageClass()) + return false; + + // Can't handle TLS. + if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV)) + if (GVar->isThreadLocal()) + return false; + + // Okay, we've committed to selecting this global. Set up the basic address. + AM.GV = GV; + + // No ABI requires an extra load for anything other than DLLImport, which + // we rejected above. Return a direct reference to the global. + if (Subtarget->isPICStyleRIPRel()) { + // Use rip-relative addressing if we can. Above we verified that the + // base and index registers are unused. + assert(AM.Base.Reg == 0 && AM.IndexReg == 0); + AM.Base.Reg = X86::RIP; + } else if (Subtarget->isPICStyleStubPIC()) { + AM.GVOpFlags = X86II::MO_PIC_BASE_OFFSET; + } else if (Subtarget->isPICStyleGOT()) { + AM.GVOpFlags = X86II::MO_GOTOFF; + } + + return true; + } + + // If all else fails, try to materialize the value in a register. + if (!AM.GV || !Subtarget->isPICStyleRIPRel()) { + if (AM.Base.Reg == 0) { + AM.Base.Reg = getRegForValue(V); + return AM.Base.Reg != 0; + } + if (AM.IndexReg == 0) { + assert(AM.Scale == 1 && "Scale with no index!"); + AM.IndexReg = getRegForValue(V); + return AM.IndexReg != 0; + } + } + + return false; +} + + +/// X86SelectStore - Select and emit code to implement store instructions. +bool X86FastISel::X86SelectStore(const Instruction *I) { + // Atomic stores need special handling. 
+ const StoreInst *S = cast<StoreInst>(I); + + if (S->isAtomic()) + return false; + + const Value *Val = S->getValueOperand(); + const Value *Ptr = S->getPointerOperand(); + + MVT VT; + if (!isTypeLegal(Val->getType(), VT, /*AllowI1=*/true)) + return false; + + unsigned Alignment = S->getAlignment(); + unsigned ABIAlignment = DL.getABITypeAlignment(Val->getType()); + if (Alignment == 0) // Ensure that codegen never sees alignment 0 + Alignment = ABIAlignment; + bool Aligned = Alignment >= ABIAlignment; + + X86AddressMode AM; + if (!X86SelectAddress(Ptr, AM)) + return false; + + return X86FastEmitStore(VT, Val, AM, createMachineMemOperandFor(I), Aligned); +} + +/// X86SelectRet - Select and emit code to implement ret instructions. +bool X86FastISel::X86SelectRet(const Instruction *I) { + const ReturnInst *Ret = cast<ReturnInst>(I); + const Function &F = *I->getParent()->getParent(); + const X86MachineFunctionInfo *X86MFInfo = + FuncInfo.MF->getInfo<X86MachineFunctionInfo>(); + + if (!FuncInfo.CanLowerReturn) + return false; + + CallingConv::ID CC = F.getCallingConv(); + if (CC != CallingConv::C && + CC != CallingConv::Fast && + CC != CallingConv::X86_FastCall && + CC != CallingConv::X86_64_SysV) + return false; + + if (Subtarget->isCallingConvWin64(CC)) + return false; + + // Don't handle popping bytes on return for now. + if (X86MFInfo->getBytesToPopOnReturn() != 0) + return false; + + // fastcc with -tailcallopt is intended to provide a guaranteed + // tail call optimization. Fastisel doesn't know how to do that. + if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) + return false; + + // Let SDISel handle vararg functions. + if (F.isVarArg()) + return false; + + // Build a list of return value registers. + SmallVector<unsigned, 4> RetRegs; + + if (Ret->getNumOperands() > 0) { + SmallVector<ISD::OutputArg, 4> Outs; + GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI); + + // Analyze operands of the call, assigning locations to each operand. + SmallVector<CCValAssign, 16> ValLocs; + CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, ValLocs, I->getContext()); + CCInfo.AnalyzeReturn(Outs, RetCC_X86); + + const Value *RV = Ret->getOperand(0); + unsigned Reg = getRegForValue(RV); + if (Reg == 0) + return false; + + // Only handle a single return value for now. + if (ValLocs.size() != 1) + return false; + + CCValAssign &VA = ValLocs[0]; + + // Don't bother handling odd stuff for now. + if (VA.getLocInfo() != CCValAssign::Full) + return false; + // Only handle register returns for now. + if (!VA.isRegLoc()) + return false; + + // The calling-convention tables for x87 returns don't tell + // the whole story. + if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) + return false; + + unsigned SrcReg = Reg + VA.getValNo(); + EVT SrcVT = TLI.getValueType(RV->getType()); + EVT DstVT = VA.getValVT(); + // Special handling for extended integers. + if (SrcVT != DstVT) { + if (SrcVT != MVT::i1 && SrcVT != MVT::i8 && SrcVT != MVT::i16) + return false; + + if (!Outs[0].Flags.isZExt() && !Outs[0].Flags.isSExt()) + return false; + + assert(DstVT == MVT::i32 && "X86 should always ext to i32"); + + if (SrcVT == MVT::i1) { + if (Outs[0].Flags.isSExt()) + return false; + SrcReg = fastEmitZExtFromI1(MVT::i8, SrcReg, /*TODO: Kill=*/false); + SrcVT = MVT::i8; + } + unsigned Op = Outs[0].Flags.isZExt() ? ISD::ZERO_EXTEND : + ISD::SIGN_EXTEND; + SrcReg = fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Op, + SrcReg, /*TODO: Kill=*/false); + } + + // Make the copy. 
+ unsigned DstReg = VA.getLocReg(); + const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg); + // Avoid a cross-class copy. This is very unlikely. + if (!SrcRC->contains(DstReg)) + return false; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), DstReg).addReg(SrcReg); + + // Add register to return instruction. + RetRegs.push_back(VA.getLocReg()); + } + + // The x86-64 ABI for returning structs by value requires that we copy + // the sret argument into %rax for the return. We saved the argument into + // a virtual register in the entry block, so now we copy the value out + // and into %rax. We also do the same with %eax for Win32. + if (F.hasStructRetAttr() && + (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) { + unsigned Reg = X86MFInfo->getSRetReturnReg(); + assert(Reg && + "SRetReturnReg should have been set in LowerFormalArguments()!"); + unsigned RetReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), RetReg).addReg(Reg); + RetRegs.push_back(RetReg); + } + + // Now emit the RET. + MachineInstrBuilder MIB = + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Subtarget->is64Bit() ? X86::RETQ : X86::RETL)); + for (unsigned i = 0, e = RetRegs.size(); i != e; ++i) + MIB.addReg(RetRegs[i], RegState::Implicit); + return true; +} + +/// X86SelectLoad - Select and emit code to implement load instructions. +/// +bool X86FastISel::X86SelectLoad(const Instruction *I) { + const LoadInst *LI = cast<LoadInst>(I); + + // Atomic loads need special handling. + if (LI->isAtomic()) + return false; + + MVT VT; + if (!isTypeLegal(LI->getType(), VT, /*AllowI1=*/true)) + return false; + + const Value *Ptr = LI->getPointerOperand(); + + X86AddressMode AM; + if (!X86SelectAddress(Ptr, AM)) + return false; + + unsigned ResultReg = 0; + if (!X86FastEmitLoad(VT, AM, createMachineMemOperandFor(LI), ResultReg)) + return false; + + updateValueMap(I, ResultReg); + return true; +} + +static unsigned X86ChooseCmpOpcode(EVT VT, const X86Subtarget *Subtarget) { + bool HasAVX = Subtarget->hasAVX(); + bool X86ScalarSSEf32 = Subtarget->hasSSE1(); + bool X86ScalarSSEf64 = Subtarget->hasSSE2(); + + switch (VT.getSimpleVT().SimpleTy) { + default: return 0; + case MVT::i8: return X86::CMP8rr; + case MVT::i16: return X86::CMP16rr; + case MVT::i32: return X86::CMP32rr; + case MVT::i64: return X86::CMP64rr; + case MVT::f32: + return X86ScalarSSEf32 ? (HasAVX ? X86::VUCOMISSrr : X86::UCOMISSrr) : 0; + case MVT::f64: + return X86ScalarSSEf64 ? (HasAVX ? X86::VUCOMISDrr : X86::UCOMISDrr) : 0; + } +} + +/// X86ChooseCmpImmediateOpcode - If we have a comparison with RHS as the RHS +/// of the comparison, return an opcode that works for the compare (e.g. +/// CMP32ri) otherwise return 0. +static unsigned X86ChooseCmpImmediateOpcode(EVT VT, const ConstantInt *RHSC) { + switch (VT.getSimpleVT().SimpleTy) { + // Otherwise, we can't fold the immediate into this comparison. + default: return 0; + case MVT::i8: return X86::CMP8ri; + case MVT::i16: return X86::CMP16ri; + case MVT::i32: return X86::CMP32ri; + case MVT::i64: + // 64-bit comparisons are only valid if the immediate fits in a 32-bit sext + // field. 
+ if ((int)RHSC->getSExtValue() == RHSC->getSExtValue()) + return X86::CMP64ri32; + return 0; + } +} + +bool X86FastISel::X86FastEmitCompare(const Value *Op0, const Value *Op1, + EVT VT, DebugLoc CurDbgLoc) { + unsigned Op0Reg = getRegForValue(Op0); + if (Op0Reg == 0) return false; + + // Handle 'null' like i32/i64 0. + if (isa<ConstantPointerNull>(Op1)) + Op1 = Constant::getNullValue(DL.getIntPtrType(Op0->getContext())); + + // We have two options: compare with register or immediate. If the RHS of + // the compare is an immediate that we can fold into this compare, use + // CMPri, otherwise use CMPrr. + if (const ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) { + if (unsigned CompareImmOpc = X86ChooseCmpImmediateOpcode(VT, Op1C)) { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, CurDbgLoc, TII.get(CompareImmOpc)) + .addReg(Op0Reg) + .addImm(Op1C->getSExtValue()); + return true; + } + } + + unsigned CompareOpc = X86ChooseCmpOpcode(VT, Subtarget); + if (CompareOpc == 0) return false; + + unsigned Op1Reg = getRegForValue(Op1); + if (Op1Reg == 0) return false; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, CurDbgLoc, TII.get(CompareOpc)) + .addReg(Op0Reg) + .addReg(Op1Reg); + + return true; +} + +bool X86FastISel::X86SelectCmp(const Instruction *I) { + const CmpInst *CI = cast<CmpInst>(I); + + MVT VT; + if (!isTypeLegal(I->getOperand(0)->getType(), VT)) + return false; + + // Try to optimize or fold the cmp. + CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); + unsigned ResultReg = 0; + switch (Predicate) { + default: break; + case CmpInst::FCMP_FALSE: { + ResultReg = createResultReg(&X86::GR32RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV32r0), + ResultReg); + ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultReg, /*Kill=*/true, + X86::sub_8bit); + if (!ResultReg) + return false; + break; + } + case CmpInst::FCMP_TRUE: { + ResultReg = createResultReg(&X86::GR8RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV8ri), + ResultReg).addImm(1); + break; + } + } + + if (ResultReg) { + updateValueMap(I, ResultReg); + return true; + } + + const Value *LHS = CI->getOperand(0); + const Value *RHS = CI->getOperand(1); + + // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, 0.0. + // We don't have to materialize a zero constant for this case and can just use + // %x again on the RHS. + if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) { + const auto *RHSC = dyn_cast<ConstantFP>(RHS); + if (RHSC && RHSC->isNullValue()) + RHS = LHS; + } + + // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction. 
+ static unsigned SETFOpcTable[2][3] = { + { X86::SETEr, X86::SETNPr, X86::AND8rr }, + { X86::SETNEr, X86::SETPr, X86::OR8rr } + }; + unsigned *SETFOpc = nullptr; + switch (Predicate) { + default: break; + case CmpInst::FCMP_OEQ: SETFOpc = &SETFOpcTable[0][0]; break; + case CmpInst::FCMP_UNE: SETFOpc = &SETFOpcTable[1][0]; break; + } + + ResultReg = createResultReg(&X86::GR8RegClass); + if (SETFOpc) { + if (!X86FastEmitCompare(LHS, RHS, VT, I->getDebugLoc())) + return false; + + unsigned FlagReg1 = createResultReg(&X86::GR8RegClass); + unsigned FlagReg2 = createResultReg(&X86::GR8RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[0]), + FlagReg1); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[1]), + FlagReg2); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[2]), + ResultReg).addReg(FlagReg1).addReg(FlagReg2); + updateValueMap(I, ResultReg); + return true; + } + + X86::CondCode CC; + bool SwapArgs; + std::tie(CC, SwapArgs) = getX86ConditionCode(Predicate); + assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code."); + unsigned Opc = X86::getSETFromCond(CC); + + if (SwapArgs) + std::swap(LHS, RHS); + + // Emit a compare of LHS/RHS. + if (!X86FastEmitCompare(LHS, RHS, VT, I->getDebugLoc())) + return false; + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg); + updateValueMap(I, ResultReg); + return true; +} + +bool X86FastISel::X86SelectZExt(const Instruction *I) { + EVT DstVT = TLI.getValueType(I->getType()); + if (!TLI.isTypeLegal(DstVT)) + return false; + + unsigned ResultReg = getRegForValue(I->getOperand(0)); + if (ResultReg == 0) + return false; + + // Handle zero-extension from i1 to i8, which is common. + MVT SrcVT = TLI.getSimpleValueType(I->getOperand(0)->getType()); + if (SrcVT.SimpleTy == MVT::i1) { + // Set the high bits to zero. + ResultReg = fastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false); + SrcVT = MVT::i8; + + if (ResultReg == 0) + return false; + } + + if (DstVT == MVT::i64) { + // Handle extension to 64-bits via sub-register shenanigans. + unsigned MovInst; + + switch (SrcVT.SimpleTy) { + case MVT::i8: MovInst = X86::MOVZX32rr8; break; + case MVT::i16: MovInst = X86::MOVZX32rr16; break; + case MVT::i32: MovInst = X86::MOV32rr; break; + default: llvm_unreachable("Unexpected zext to i64 source type"); + } + + unsigned Result32 = createResultReg(&X86::GR32RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(MovInst), Result32) + .addReg(ResultReg); + + ResultReg = createResultReg(&X86::GR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::SUBREG_TO_REG), + ResultReg) + .addImm(0).addReg(Result32).addImm(X86::sub_32bit); + } else if (DstVT != MVT::i8) { + ResultReg = fastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::ZERO_EXTEND, + ResultReg, /*Kill=*/true); + if (ResultReg == 0) + return false; + } + + updateValueMap(I, ResultReg); + return true; +} + +bool X86FastISel::X86SelectBranch(const Instruction *I) { + // Unconditional branches are selected by tablegen-generated code. + // Handle a conditional branch. + const BranchInst *BI = cast<BranchInst>(I); + MachineBasicBlock *TrueMBB = FuncInfo.MBBMap[BI->getSuccessor(0)]; + MachineBasicBlock *FalseMBB = FuncInfo.MBBMap[BI->getSuccessor(1)]; + + // Fold the common case of a conditional branch with a comparison + // in the same block (values defined on other blocks may not have + // initialized registers). 
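+  // For example:
+  //   %c = icmp slt i32 %a, %b
+  //   br i1 %c, label %true, label %false
+  // is selected directly to a CMP plus a conditional jump, without
+  // materializing %c in a register.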
+ X86::CondCode CC; + if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) { + if (CI->hasOneUse() && CI->getParent() == I->getParent()) { + EVT VT = TLI.getValueType(CI->getOperand(0)->getType()); + + // Try to optimize or fold the cmp. + CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); + switch (Predicate) { + default: break; + case CmpInst::FCMP_FALSE: fastEmitBranch(FalseMBB, DbgLoc); return true; + case CmpInst::FCMP_TRUE: fastEmitBranch(TrueMBB, DbgLoc); return true; + } + + const Value *CmpLHS = CI->getOperand(0); + const Value *CmpRHS = CI->getOperand(1); + + // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, + // 0.0. + // We don't have to materialize a zero constant for this case and can just + // use %x again on the RHS. + if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) { + const auto *CmpRHSC = dyn_cast<ConstantFP>(CmpRHS); + if (CmpRHSC && CmpRHSC->isNullValue()) + CmpRHS = CmpLHS; + } + + // Try to take advantage of fallthrough opportunities. + if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) { + std::swap(TrueMBB, FalseMBB); + Predicate = CmpInst::getInversePredicate(Predicate); + } + + // FCMP_OEQ and FCMP_UNE cannot be expressed with a single flag/condition + // code check. Instead two branch instructions are required to check all + // the flags. First we change the predicate to a supported condition code, + // which will be the first branch. Later one we will emit the second + // branch. + bool NeedExtraBranch = false; + switch (Predicate) { + default: break; + case CmpInst::FCMP_OEQ: + std::swap(TrueMBB, FalseMBB); // fall-through + case CmpInst::FCMP_UNE: + NeedExtraBranch = true; + Predicate = CmpInst::FCMP_ONE; + break; + } + + bool SwapArgs; + unsigned BranchOpc; + std::tie(CC, SwapArgs) = getX86ConditionCode(Predicate); + assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code."); + + BranchOpc = X86::GetCondBranchFromCond(CC); + if (SwapArgs) + std::swap(CmpLHS, CmpRHS); + + // Emit a compare of the LHS and RHS, setting the flags. + if (!X86FastEmitCompare(CmpLHS, CmpRHS, VT, CI->getDebugLoc())) + return false; + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc)) + .addMBB(TrueMBB); + + // X86 requires a second branch to handle UNE (and OEQ, which is mapped + // to UNE above). + if (NeedExtraBranch) { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JP_1)) + .addMBB(TrueMBB); + } + + // Obtain the branch weight and add the TrueBB to the successor list. + uint32_t BranchWeight = 0; + if (FuncInfo.BPI) + BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), + TrueMBB->getBasicBlock()); + FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight); + + // Emits an unconditional branch to the FalseBB, obtains the branch + // weight, and adds it to the successor list. + fastEmitBranch(FalseMBB, DbgLoc); + + return true; + } + } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) { + // Handle things like "%cond = trunc i32 %X to i1 / br i1 %cond", which + // typically happen for _Bool and C++ bools. 
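+    // Such a branch is emitted as a test of bit 0 followed by JNE/JE, e.g.
+    //   testl $1, %reg
+    //   jne   TrueMBB
+    // instead of first truncating the value.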
+ MVT SourceVT; + if (TI->hasOneUse() && TI->getParent() == I->getParent() && + isTypeLegal(TI->getOperand(0)->getType(), SourceVT)) { + unsigned TestOpc = 0; + switch (SourceVT.SimpleTy) { + default: break; + case MVT::i8: TestOpc = X86::TEST8ri; break; + case MVT::i16: TestOpc = X86::TEST16ri; break; + case MVT::i32: TestOpc = X86::TEST32ri; break; + case MVT::i64: TestOpc = X86::TEST64ri32; break; + } + if (TestOpc) { + unsigned OpReg = getRegForValue(TI->getOperand(0)); + if (OpReg == 0) return false; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TestOpc)) + .addReg(OpReg).addImm(1); + + unsigned JmpOpc = X86::JNE_1; + if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) { + std::swap(TrueMBB, FalseMBB); + JmpOpc = X86::JE_1; + } + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(JmpOpc)) + .addMBB(TrueMBB); + fastEmitBranch(FalseMBB, DbgLoc); + uint32_t BranchWeight = 0; + if (FuncInfo.BPI) + BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), + TrueMBB->getBasicBlock()); + FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight); + return true; + } + } + } else if (foldX86XALUIntrinsic(CC, BI, BI->getCondition())) { + // Fake request the condition, otherwise the intrinsic might be completely + // optimized away. + unsigned TmpReg = getRegForValue(BI->getCondition()); + if (TmpReg == 0) + return false; + + unsigned BranchOpc = X86::GetCondBranchFromCond(CC); + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc)) + .addMBB(TrueMBB); + fastEmitBranch(FalseMBB, DbgLoc); + uint32_t BranchWeight = 0; + if (FuncInfo.BPI) + BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), + TrueMBB->getBasicBlock()); + FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight); + return true; + } + + // Otherwise do a clumsy setcc and re-test it. + // Note that i1 essentially gets ANY_EXTEND'ed to i8 where it isn't used + // in an explicit cast, so make sure to handle that correctly. 
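+  // Only the low bit of the i8 condition value is defined, so the test below
+  // checks bit 0 explicitly (TEST8ri $1) rather than the whole register.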
+ unsigned OpReg = getRegForValue(BI->getCondition()); + if (OpReg == 0) return false; + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri)) + .addReg(OpReg).addImm(1); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JNE_1)) + .addMBB(TrueMBB); + fastEmitBranch(FalseMBB, DbgLoc); + uint32_t BranchWeight = 0; + if (FuncInfo.BPI) + BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), + TrueMBB->getBasicBlock()); + FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight); + return true; +} + +bool X86FastISel::X86SelectShift(const Instruction *I) { + unsigned CReg = 0, OpReg = 0; + const TargetRegisterClass *RC = nullptr; + if (I->getType()->isIntegerTy(8)) { + CReg = X86::CL; + RC = &X86::GR8RegClass; + switch (I->getOpcode()) { + case Instruction::LShr: OpReg = X86::SHR8rCL; break; + case Instruction::AShr: OpReg = X86::SAR8rCL; break; + case Instruction::Shl: OpReg = X86::SHL8rCL; break; + default: return false; + } + } else if (I->getType()->isIntegerTy(16)) { + CReg = X86::CX; + RC = &X86::GR16RegClass; + switch (I->getOpcode()) { + case Instruction::LShr: OpReg = X86::SHR16rCL; break; + case Instruction::AShr: OpReg = X86::SAR16rCL; break; + case Instruction::Shl: OpReg = X86::SHL16rCL; break; + default: return false; + } + } else if (I->getType()->isIntegerTy(32)) { + CReg = X86::ECX; + RC = &X86::GR32RegClass; + switch (I->getOpcode()) { + case Instruction::LShr: OpReg = X86::SHR32rCL; break; + case Instruction::AShr: OpReg = X86::SAR32rCL; break; + case Instruction::Shl: OpReg = X86::SHL32rCL; break; + default: return false; + } + } else if (I->getType()->isIntegerTy(64)) { + CReg = X86::RCX; + RC = &X86::GR64RegClass; + switch (I->getOpcode()) { + case Instruction::LShr: OpReg = X86::SHR64rCL; break; + case Instruction::AShr: OpReg = X86::SAR64rCL; break; + case Instruction::Shl: OpReg = X86::SHL64rCL; break; + default: return false; + } + } else { + return false; + } + + MVT VT; + if (!isTypeLegal(I->getType(), VT)) + return false; + + unsigned Op0Reg = getRegForValue(I->getOperand(0)); + if (Op0Reg == 0) return false; + + unsigned Op1Reg = getRegForValue(I->getOperand(1)); + if (Op1Reg == 0) return false; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), + CReg).addReg(Op1Reg); + + // The shift instruction uses X86::CL. If we defined a super-register + // of X86::CL, emit a subreg KILL to precisely describe what we're doing here. + if (CReg != X86::CL) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::KILL), X86::CL) + .addReg(CReg, RegState::Kill); + + unsigned ResultReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(OpReg), ResultReg) + .addReg(Op0Reg); + updateValueMap(I, ResultReg); + return true; +} + +bool X86FastISel::X86SelectDivRem(const Instruction *I) { + const static unsigned NumTypes = 4; // i8, i16, i32, i64 + const static unsigned NumOps = 4; // SDiv, SRem, UDiv, URem + const static bool S = true; // IsSigned + const static bool U = false; // !IsSigned + const static unsigned Copy = TargetOpcode::COPY; + // For the X86 DIV/IDIV instruction, in most cases the dividend + // (numerator) must be in a specific register pair highreg:lowreg, + // producing the quotient in lowreg and the remainder in highreg. + // For most data types, to set up the instruction, the dividend is + // copied into lowreg, and lowreg is sign-extended or zero-extended + // into highreg. 
The exception is i8, where the dividend is defined + // as a single register rather than a register pair, and we + // therefore directly sign-extend or zero-extend the dividend into + // lowreg, instead of copying, and ignore the highreg. + const static struct DivRemEntry { + // The following portion depends only on the data type. + const TargetRegisterClass *RC; + unsigned LowInReg; // low part of the register pair + unsigned HighInReg; // high part of the register pair + // The following portion depends on both the data type and the operation. + struct DivRemResult { + unsigned OpDivRem; // The specific DIV/IDIV opcode to use. + unsigned OpSignExtend; // Opcode for sign-extending lowreg into + // highreg, or copying a zero into highreg. + unsigned OpCopy; // Opcode for copying dividend into lowreg, or + // zero/sign-extending into lowreg for i8. + unsigned DivRemResultReg; // Register containing the desired result. + bool IsOpSigned; // Whether to use signed or unsigned form. + } ResultTable[NumOps]; + } OpTable[NumTypes] = { + { &X86::GR8RegClass, X86::AX, 0, { + { X86::IDIV8r, 0, X86::MOVSX16rr8, X86::AL, S }, // SDiv + { X86::IDIV8r, 0, X86::MOVSX16rr8, X86::AH, S }, // SRem + { X86::DIV8r, 0, X86::MOVZX16rr8, X86::AL, U }, // UDiv + { X86::DIV8r, 0, X86::MOVZX16rr8, X86::AH, U }, // URem + } + }, // i8 + { &X86::GR16RegClass, X86::AX, X86::DX, { + { X86::IDIV16r, X86::CWD, Copy, X86::AX, S }, // SDiv + { X86::IDIV16r, X86::CWD, Copy, X86::DX, S }, // SRem + { X86::DIV16r, X86::MOV32r0, Copy, X86::AX, U }, // UDiv + { X86::DIV16r, X86::MOV32r0, Copy, X86::DX, U }, // URem + } + }, // i16 + { &X86::GR32RegClass, X86::EAX, X86::EDX, { + { X86::IDIV32r, X86::CDQ, Copy, X86::EAX, S }, // SDiv + { X86::IDIV32r, X86::CDQ, Copy, X86::EDX, S }, // SRem + { X86::DIV32r, X86::MOV32r0, Copy, X86::EAX, U }, // UDiv + { X86::DIV32r, X86::MOV32r0, Copy, X86::EDX, U }, // URem + } + }, // i32 + { &X86::GR64RegClass, X86::RAX, X86::RDX, { + { X86::IDIV64r, X86::CQO, Copy, X86::RAX, S }, // SDiv + { X86::IDIV64r, X86::CQO, Copy, X86::RDX, S }, // SRem + { X86::DIV64r, X86::MOV32r0, Copy, X86::RAX, U }, // UDiv + { X86::DIV64r, X86::MOV32r0, Copy, X86::RDX, U }, // URem + } + }, // i64 + }; + + MVT VT; + if (!isTypeLegal(I->getType(), VT)) + return false; + + unsigned TypeIndex, OpIndex; + switch (VT.SimpleTy) { + default: return false; + case MVT::i8: TypeIndex = 0; break; + case MVT::i16: TypeIndex = 1; break; + case MVT::i32: TypeIndex = 2; break; + case MVT::i64: TypeIndex = 3; + if (!Subtarget->is64Bit()) + return false; + break; + } + + switch (I->getOpcode()) { + default: llvm_unreachable("Unexpected div/rem opcode"); + case Instruction::SDiv: OpIndex = 0; break; + case Instruction::SRem: OpIndex = 1; break; + case Instruction::UDiv: OpIndex = 2; break; + case Instruction::URem: OpIndex = 3; break; + } + + const DivRemEntry &TypeEntry = OpTable[TypeIndex]; + const DivRemEntry::DivRemResult &OpEntry = TypeEntry.ResultTable[OpIndex]; + unsigned Op0Reg = getRegForValue(I->getOperand(0)); + if (Op0Reg == 0) + return false; + unsigned Op1Reg = getRegForValue(I->getOperand(1)); + if (Op1Reg == 0) + return false; + + // Move op0 into low-order input register. + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(OpEntry.OpCopy), TypeEntry.LowInReg).addReg(Op0Reg); + // Zero-extend or sign-extend into high-order input register. 
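+  // For a signed 32-bit divide, for example, the complete sequence is:
+  //   movl  %op0, %eax
+  //   cltd                  (CDQ: sign-extend EAX into EDX)
+  //   idivl %op1            (quotient in EAX, remainder in EDX)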
+ if (OpEntry.OpSignExtend) { + if (OpEntry.IsOpSigned) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(OpEntry.OpSignExtend)); + else { + unsigned Zero32 = createResultReg(&X86::GR32RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(X86::MOV32r0), Zero32); + + // Copy the zero into the appropriate sub/super/identical physical + // register. Unfortunately the operations needed are not uniform enough + // to fit neatly into the table above. + if (VT.SimpleTy == MVT::i16) { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Copy), TypeEntry.HighInReg) + .addReg(Zero32, 0, X86::sub_16bit); + } else if (VT.SimpleTy == MVT::i32) { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Copy), TypeEntry.HighInReg) + .addReg(Zero32); + } else if (VT.SimpleTy == MVT::i64) { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::SUBREG_TO_REG), TypeEntry.HighInReg) + .addImm(0).addReg(Zero32).addImm(X86::sub_32bit); + } + } + } + // Generate the DIV/IDIV instruction. + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(OpEntry.OpDivRem)).addReg(Op1Reg); + // For i8 remainder, we can't reference AH directly, as we'll end + // up with bogus copies like %R9B = COPY %AH. Reference AX + // instead to prevent AH references in a REX instruction. + // + // The current assumption of the fast register allocator is that isel + // won't generate explicit references to the GPR8_NOREX registers. If + // the allocator and/or the backend get enhanced to be more robust in + // that regard, this can be, and should be, removed. + unsigned ResultReg = 0; + if ((I->getOpcode() == Instruction::SRem || + I->getOpcode() == Instruction::URem) && + OpEntry.DivRemResultReg == X86::AH && Subtarget->is64Bit()) { + unsigned SourceSuperReg = createResultReg(&X86::GR16RegClass); + unsigned ResultSuperReg = createResultReg(&X86::GR16RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Copy), SourceSuperReg).addReg(X86::AX); + + // Shift AX right by 8 bits instead of using AH. + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SHR16ri), + ResultSuperReg).addReg(SourceSuperReg).addImm(8); + + // Now reference the 8-bit subreg of the result. + ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultSuperReg, + /*Kill=*/true, X86::sub_8bit); + } + // Copy the result out of the physreg if we haven't already. + if (!ResultReg) { + ResultReg = createResultReg(TypeEntry.RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Copy), ResultReg) + .addReg(OpEntry.DivRemResultReg); + } + updateValueMap(I, ResultReg); + + return true; +} + +/// \brief Emit a conditional move instruction (if the are supported) to lower +/// the select. +bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) { + // Check if the subtarget supports these instructions. + if (!Subtarget->hasCMov()) + return false; + + // FIXME: Add support for i8. + if (RetVT < MVT::i16 || RetVT > MVT::i64) + return false; + + const Value *Cond = I->getOperand(0); + const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT); + bool NeedTest = true; + X86::CondCode CC = X86::COND_NE; + + // Optimize conditions coming from a compare if both instructions are in the + // same basic block (values defined in other basic blocks may not have + // initialized registers). 
+ const auto *CI = dyn_cast<CmpInst>(Cond); + if (CI && (CI->getParent() == I->getParent())) { + CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); + + // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction. + static unsigned SETFOpcTable[2][3] = { + { X86::SETNPr, X86::SETEr , X86::TEST8rr }, + { X86::SETPr, X86::SETNEr, X86::OR8rr } + }; + unsigned *SETFOpc = nullptr; + switch (Predicate) { + default: break; + case CmpInst::FCMP_OEQ: + SETFOpc = &SETFOpcTable[0][0]; + Predicate = CmpInst::ICMP_NE; + break; + case CmpInst::FCMP_UNE: + SETFOpc = &SETFOpcTable[1][0]; + Predicate = CmpInst::ICMP_NE; + break; + } + + bool NeedSwap; + std::tie(CC, NeedSwap) = getX86ConditionCode(Predicate); + assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code."); + + const Value *CmpLHS = CI->getOperand(0); + const Value *CmpRHS = CI->getOperand(1); + if (NeedSwap) + std::swap(CmpLHS, CmpRHS); + + EVT CmpVT = TLI.getValueType(CmpLHS->getType()); + // Emit a compare of the LHS and RHS, setting the flags. + if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT, CI->getDebugLoc())) + return false; + + if (SETFOpc) { + unsigned FlagReg1 = createResultReg(&X86::GR8RegClass); + unsigned FlagReg2 = createResultReg(&X86::GR8RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[0]), + FlagReg1); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[1]), + FlagReg2); + auto const &II = TII.get(SETFOpc[2]); + if (II.getNumDefs()) { + unsigned TmpReg = createResultReg(&X86::GR8RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, TmpReg) + .addReg(FlagReg2).addReg(FlagReg1); + } else { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II) + .addReg(FlagReg2).addReg(FlagReg1); + } + } + NeedTest = false; + } else if (foldX86XALUIntrinsic(CC, I, Cond)) { + // Fake request the condition, otherwise the intrinsic might be completely + // optimized away. + unsigned TmpReg = getRegForValue(Cond); + if (TmpReg == 0) + return false; + + NeedTest = false; + } + + if (NeedTest) { + // Selects operate on i1, however, CondReg is 8 bits width and may contain + // garbage. Indeed, only the less significant bit is supposed to be + // accurate. If we read more than the lsb, we may see non-zero values + // whereas lsb is zero. Therefore, we have to truncate Op0Reg to i1 for + // the select. This is achieved by performing TEST against 1. + unsigned CondReg = getRegForValue(Cond); + if (CondReg == 0) + return false; + bool CondIsKill = hasTrivialKill(Cond); + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri)) + .addReg(CondReg, getKillRegState(CondIsKill)).addImm(1); + } + + const Value *LHS = I->getOperand(1); + const Value *RHS = I->getOperand(2); + + unsigned RHSReg = getRegForValue(RHS); + bool RHSIsKill = hasTrivialKill(RHS); + + unsigned LHSReg = getRegForValue(LHS); + bool LHSIsKill = hasTrivialKill(LHS); + + if (!LHSReg || !RHSReg) + return false; + + unsigned Opc = X86::getCMovFromCond(CC, RC->getSize()); + unsigned ResultReg = fastEmitInst_rr(Opc, RC, RHSReg, RHSIsKill, + LHSReg, LHSIsKill); + updateValueMap(I, ResultReg); + return true; +} + +/// \brief Emit SSE instructions to lower the select. +/// +/// Try to use SSE1/SSE2 instructions to simulate a select without branches. +/// This lowers fp selects into a CMP/AND/ANDN/OR sequence when the necessary +/// SSE instructions are available. 
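+/// The emitted sequence computes (mask & LHS) | (~mask & RHS), where mask is
+/// the all-ones/all-zeros value produced by CMPSS/CMPSD for the predicate.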
+bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) { + // Optimize conditions coming from a compare if both instructions are in the + // same basic block (values defined in other basic blocks may not have + // initialized registers). + const auto *CI = dyn_cast<FCmpInst>(I->getOperand(0)); + if (!CI || (CI->getParent() != I->getParent())) + return false; + + if (I->getType() != CI->getOperand(0)->getType() || + !((Subtarget->hasSSE1() && RetVT == MVT::f32) || + (Subtarget->hasSSE2() && RetVT == MVT::f64))) + return false; + + const Value *CmpLHS = CI->getOperand(0); + const Value *CmpRHS = CI->getOperand(1); + CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); + + // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, 0.0. + // We don't have to materialize a zero constant for this case and can just use + // %x again on the RHS. + if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) { + const auto *CmpRHSC = dyn_cast<ConstantFP>(CmpRHS); + if (CmpRHSC && CmpRHSC->isNullValue()) + CmpRHS = CmpLHS; + } + + unsigned CC; + bool NeedSwap; + std::tie(CC, NeedSwap) = getX86SSEConditionCode(Predicate); + if (CC > 7) + return false; + + if (NeedSwap) + std::swap(CmpLHS, CmpRHS); + + static unsigned OpcTable[2][2][4] = { + { { X86::CMPSSrr, X86::FsANDPSrr, X86::FsANDNPSrr, X86::FsORPSrr }, + { X86::VCMPSSrr, X86::VFsANDPSrr, X86::VFsANDNPSrr, X86::VFsORPSrr } }, + { { X86::CMPSDrr, X86::FsANDPDrr, X86::FsANDNPDrr, X86::FsORPDrr }, + { X86::VCMPSDrr, X86::VFsANDPDrr, X86::VFsANDNPDrr, X86::VFsORPDrr } } + }; + + bool HasAVX = Subtarget->hasAVX(); + unsigned *Opc = nullptr; + switch (RetVT.SimpleTy) { + default: return false; + case MVT::f32: Opc = &OpcTable[0][HasAVX][0]; break; + case MVT::f64: Opc = &OpcTable[1][HasAVX][0]; break; + } + + const Value *LHS = I->getOperand(1); + const Value *RHS = I->getOperand(2); + + unsigned LHSReg = getRegForValue(LHS); + bool LHSIsKill = hasTrivialKill(LHS); + + unsigned RHSReg = getRegForValue(RHS); + bool RHSIsKill = hasTrivialKill(RHS); + + unsigned CmpLHSReg = getRegForValue(CmpLHS); + bool CmpLHSIsKill = hasTrivialKill(CmpLHS); + + unsigned CmpRHSReg = getRegForValue(CmpRHS); + bool CmpRHSIsKill = hasTrivialKill(CmpRHS); + + if (!LHSReg || !RHSReg || !CmpLHS || !CmpRHS) + return false; + + const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT); + unsigned CmpReg = fastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill, + CmpRHSReg, CmpRHSIsKill, CC); + unsigned AndReg = fastEmitInst_rr(Opc[1], RC, CmpReg, /*IsKill=*/false, + LHSReg, LHSIsKill); + unsigned AndNReg = fastEmitInst_rr(Opc[2], RC, CmpReg, /*IsKill=*/true, + RHSReg, RHSIsKill); + unsigned ResultReg = fastEmitInst_rr(Opc[3], RC, AndNReg, /*IsKill=*/true, + AndReg, /*IsKill=*/true); + updateValueMap(I, ResultReg); + return true; +} + +bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) { + // These are pseudo CMOV instructions and will be later expanded into control- + // flow. 
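+  // Each CMOV_* pseudo is expanded after instruction selection into a small
+  // diamond of basic blocks (a conditional branch plus a PHI), so no hardware
+  // CMOV support is required here.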
+ unsigned Opc; + switch (RetVT.SimpleTy) { + default: return false; + case MVT::i8: Opc = X86::CMOV_GR8; break; + case MVT::i16: Opc = X86::CMOV_GR16; break; + case MVT::i32: Opc = X86::CMOV_GR32; break; + case MVT::f32: Opc = X86::CMOV_FR32; break; + case MVT::f64: Opc = X86::CMOV_FR64; break; + } + + const Value *Cond = I->getOperand(0); + X86::CondCode CC = X86::COND_NE; + + // Optimize conditions coming from a compare if both instructions are in the + // same basic block (values defined in other basic blocks may not have + // initialized registers). + const auto *CI = dyn_cast<CmpInst>(Cond); + if (CI && (CI->getParent() == I->getParent())) { + bool NeedSwap; + std::tie(CC, NeedSwap) = getX86ConditionCode(CI->getPredicate()); + if (CC > X86::LAST_VALID_COND) + return false; + + const Value *CmpLHS = CI->getOperand(0); + const Value *CmpRHS = CI->getOperand(1); + + if (NeedSwap) + std::swap(CmpLHS, CmpRHS); + + EVT CmpVT = TLI.getValueType(CmpLHS->getType()); + if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT, CI->getDebugLoc())) + return false; + } else { + unsigned CondReg = getRegForValue(Cond); + if (CondReg == 0) + return false; + bool CondIsKill = hasTrivialKill(Cond); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri)) + .addReg(CondReg, getKillRegState(CondIsKill)).addImm(1); + } + + const Value *LHS = I->getOperand(1); + const Value *RHS = I->getOperand(2); + + unsigned LHSReg = getRegForValue(LHS); + bool LHSIsKill = hasTrivialKill(LHS); + + unsigned RHSReg = getRegForValue(RHS); + bool RHSIsKill = hasTrivialKill(RHS); + + if (!LHSReg || !RHSReg) + return false; + + const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT); + + unsigned ResultReg = + fastEmitInst_rri(Opc, RC, RHSReg, RHSIsKill, LHSReg, LHSIsKill, CC); + updateValueMap(I, ResultReg); + return true; +} + +bool X86FastISel::X86SelectSelect(const Instruction *I) { + MVT RetVT; + if (!isTypeLegal(I->getType(), RetVT)) + return false; + + // Check if we can fold the select. + if (const auto *CI = dyn_cast<CmpInst>(I->getOperand(0))) { + CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); + const Value *Opnd = nullptr; + switch (Predicate) { + default: break; + case CmpInst::FCMP_FALSE: Opnd = I->getOperand(2); break; + case CmpInst::FCMP_TRUE: Opnd = I->getOperand(1); break; + } + // No need for a select anymore - this is an unconditional move. + if (Opnd) { + unsigned OpReg = getRegForValue(Opnd); + if (OpReg == 0) + return false; + bool OpIsKill = hasTrivialKill(Opnd); + const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT); + unsigned ResultReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), ResultReg) + .addReg(OpReg, getKillRegState(OpIsKill)); + updateValueMap(I, ResultReg); + return true; + } + } + + // First try to use real conditional move instructions. + if (X86FastEmitCMoveSelect(RetVT, I)) + return true; + + // Try to use a sequence of SSE instructions to simulate a conditional move. + if (X86FastEmitSSESelect(RetVT, I)) + return true; + + // Fall-back to pseudo conditional move instructions, which will be later + // converted to control-flow. + if (X86FastEmitPseudoSelect(RetVT, I)) + return true; + + return false; +} + +bool X86FastISel::X86SelectFPExt(const Instruction *I) { + // fpext from float to double. 
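+  // e.g. "%d = fpext float %f to double" becomes a single cvtss2sd.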
+ if (X86ScalarSSEf64 && + I->getType()->isDoubleTy()) { + const Value *V = I->getOperand(0); + if (V->getType()->isFloatTy()) { + unsigned OpReg = getRegForValue(V); + if (OpReg == 0) return false; + unsigned ResultReg = createResultReg(&X86::FR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(X86::CVTSS2SDrr), ResultReg) + .addReg(OpReg); + updateValueMap(I, ResultReg); + return true; + } + } + + return false; +} + +bool X86FastISel::X86SelectFPTrunc(const Instruction *I) { + if (X86ScalarSSEf64) { + if (I->getType()->isFloatTy()) { + const Value *V = I->getOperand(0); + if (V->getType()->isDoubleTy()) { + unsigned OpReg = getRegForValue(V); + if (OpReg == 0) return false; + unsigned ResultReg = createResultReg(&X86::FR32RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(X86::CVTSD2SSrr), ResultReg) + .addReg(OpReg); + updateValueMap(I, ResultReg); + return true; + } + } + } + + return false; +} + +bool X86FastISel::X86SelectTrunc(const Instruction *I) { + EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType()); + EVT DstVT = TLI.getValueType(I->getType()); + + // This code only handles truncation to byte. + if (DstVT != MVT::i8 && DstVT != MVT::i1) + return false; + if (!TLI.isTypeLegal(SrcVT)) + return false; + + unsigned InputReg = getRegForValue(I->getOperand(0)); + if (!InputReg) + // Unhandled operand. Halt "fast" selection and bail. + return false; + + if (SrcVT == MVT::i8) { + // Truncate from i8 to i1; no code needed. + updateValueMap(I, InputReg); + return true; + } + + if (!Subtarget->is64Bit()) { + // If we're on x86-32; we can't extract an i8 from a general register. + // First issue a copy to GR16_ABCD or GR32_ABCD. + const TargetRegisterClass *CopyRC = + (SrcVT == MVT::i16) ? &X86::GR16_ABCDRegClass : &X86::GR32_ABCDRegClass; + unsigned CopyReg = createResultReg(CopyRC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), CopyReg).addReg(InputReg); + InputReg = CopyReg; + } + + // Issue an extract_subreg. + unsigned ResultReg = fastEmitInst_extractsubreg(MVT::i8, + InputReg, /*Kill=*/true, + X86::sub_8bit); + if (!ResultReg) + return false; + + updateValueMap(I, ResultReg); + return true; +} + +bool X86FastISel::IsMemcpySmall(uint64_t Len) { + return Len <= (Subtarget->is64Bit() ? 32 : 16); +} + +bool X86FastISel::TryEmitSmallMemcpy(X86AddressMode DestAM, + X86AddressMode SrcAM, uint64_t Len) { + + // Make sure we don't bloat code by inlining very large memcpy's. + if (!IsMemcpySmall(Len)) + return false; + + bool i64Legal = Subtarget->is64Bit(); + + // We don't care about alignment here since we just emit integer accesses. + while (Len) { + MVT VT; + if (Len >= 8 && i64Legal) + VT = MVT::i64; + else if (Len >= 4) + VT = MVT::i32; + else if (Len >= 2) + VT = MVT::i16; + else + VT = MVT::i8; + + unsigned Reg; + bool RV = X86FastEmitLoad(VT, SrcAM, nullptr, Reg); + RV &= X86FastEmitStore(VT, Reg, /*Kill=*/true, DestAM); + assert(RV && "Failed to emit load or store??"); + + unsigned Size = VT.getSizeInBits()/8; + Len -= Size; + DestAM.Disp += Size; + SrcAM.Disp += Size; + } + + return true; +} + +bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { + // FIXME: Handle more intrinsics. 
+ switch (II->getIntrinsicID()) { + default: return false; + case Intrinsic::frameaddress: { + Type *RetTy = II->getCalledFunction()->getReturnType(); + + MVT VT; + if (!isTypeLegal(RetTy, VT)) + return false; + + unsigned Opc; + const TargetRegisterClass *RC = nullptr; + + switch (VT.SimpleTy) { + default: llvm_unreachable("Invalid result type for frameaddress."); + case MVT::i32: Opc = X86::MOV32rm; RC = &X86::GR32RegClass; break; + case MVT::i64: Opc = X86::MOV64rm; RC = &X86::GR64RegClass; break; + } + + // This needs to be set before we call getPtrSizedFrameRegister, otherwise + // we get the wrong frame register. + MachineFrameInfo *MFI = FuncInfo.MF->getFrameInfo(); + MFI->setFrameAddressIsTaken(true); + + const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( + TM.getSubtargetImpl()->getRegisterInfo()); + unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(*(FuncInfo.MF)); + assert(((FrameReg == X86::RBP && VT == MVT::i64) || + (FrameReg == X86::EBP && VT == MVT::i32)) && + "Invalid Frame Register!"); + + // Always make a copy of the frame register to to a vreg first, so that we + // never directly reference the frame register (the TwoAddressInstruction- + // Pass doesn't like that). + unsigned SrcReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), SrcReg).addReg(FrameReg); + + // Now recursively load from the frame address. + // movq (%rbp), %rax + // movq (%rax), %rax + // movq (%rax), %rax + // ... + unsigned DestReg; + unsigned Depth = cast<ConstantInt>(II->getOperand(0))->getZExtValue(); + while (Depth--) { + DestReg = createResultReg(RC); + addDirectMem(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Opc), DestReg), SrcReg); + SrcReg = DestReg; + } + + updateValueMap(II, SrcReg); + return true; + } + case Intrinsic::memcpy: { + const MemCpyInst *MCI = cast<MemCpyInst>(II); + // Don't handle volatile or variable length memcpys. + if (MCI->isVolatile()) + return false; + + if (isa<ConstantInt>(MCI->getLength())) { + // Small memcpy's are common enough that we want to do them + // without a call if possible. + uint64_t Len = cast<ConstantInt>(MCI->getLength())->getZExtValue(); + if (IsMemcpySmall(Len)) { + X86AddressMode DestAM, SrcAM; + if (!X86SelectAddress(MCI->getRawDest(), DestAM) || + !X86SelectAddress(MCI->getRawSource(), SrcAM)) + return false; + TryEmitSmallMemcpy(DestAM, SrcAM, Len); + return true; + } + } + + unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32; + if (!MCI->getLength()->getType()->isIntegerTy(SizeWidth)) + return false; + + if (MCI->getSourceAddressSpace() > 255 || MCI->getDestAddressSpace() > 255) + return false; + + return lowerCallTo(II, "memcpy", II->getNumArgOperands() - 2); + } + case Intrinsic::memset: { + const MemSetInst *MSI = cast<MemSetInst>(II); + + if (MSI->isVolatile()) + return false; + + unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32; + if (!MSI->getLength()->getType()->isIntegerTy(SizeWidth)) + return false; + + if (MSI->getDestAddressSpace() > 255) + return false; + + return lowerCallTo(II, "memset", II->getNumArgOperands() - 2); + } + case Intrinsic::stackprotector: { + // Emit code to store the stack guard onto the stack. + EVT PtrTy = TLI.getPointerTy(); + + const Value *Op1 = II->getArgOperand(0); // The guard's value. + const AllocaInst *Slot = cast<AllocaInst>(II->getArgOperand(1)); + + MFI.setStackProtectorIndex(FuncInfo.StaticAllocaMap[Slot]); + + // Grab the frame index. 
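+    // Only the store of the guard value is emitted here; the comparison
+    // against the guard at function exit is generated separately.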
+ X86AddressMode AM; + if (!X86SelectAddress(Slot, AM)) return false; + if (!X86FastEmitStore(PtrTy, Op1, AM)) return false; + return true; + } + case Intrinsic::dbg_declare: { + const DbgDeclareInst *DI = cast<DbgDeclareInst>(II); + X86AddressMode AM; + assert(DI->getAddress() && "Null address should be checked earlier!"); + if (!X86SelectAddress(DI->getAddress(), AM)) + return false; + const MCInstrDesc &II = TII.get(TargetOpcode::DBG_VALUE); + // FIXME may need to add RegState::Debug to any registers produced, + // although ESP/EBP should be the only ones at the moment. + addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II), AM) + .addImm(0) + .addMetadata(DI->getVariable()) + .addMetadata(DI->getExpression()); + return true; + } + case Intrinsic::trap: { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TRAP)); + return true; + } + case Intrinsic::sqrt: { + if (!Subtarget->hasSSE1()) + return false; + + Type *RetTy = II->getCalledFunction()->getReturnType(); + + MVT VT; + if (!isTypeLegal(RetTy, VT)) + return false; + + // Unfortunately we can't use fastEmit_r, because the AVX version of FSQRT + // is not generated by FastISel yet. + // FIXME: Update this code once tablegen can handle it. + static const unsigned SqrtOpc[2][2] = { + {X86::SQRTSSr, X86::VSQRTSSr}, + {X86::SQRTSDr, X86::VSQRTSDr} + }; + bool HasAVX = Subtarget->hasAVX(); + unsigned Opc; + const TargetRegisterClass *RC; + switch (VT.SimpleTy) { + default: return false; + case MVT::f32: Opc = SqrtOpc[0][HasAVX]; RC = &X86::FR32RegClass; break; + case MVT::f64: Opc = SqrtOpc[1][HasAVX]; RC = &X86::FR64RegClass; break; + } + + const Value *SrcVal = II->getArgOperand(0); + unsigned SrcReg = getRegForValue(SrcVal); + + if (SrcReg == 0) + return false; + + unsigned ImplicitDefReg = 0; + if (HasAVX) { + ImplicitDefReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg); + } + + unsigned ResultReg = createResultReg(RC); + MachineInstrBuilder MIB; + MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), + ResultReg); + + if (ImplicitDefReg) + MIB.addReg(ImplicitDefReg); + + MIB.addReg(SrcReg); + + updateValueMap(II, ResultReg); + return true; + } + case Intrinsic::sadd_with_overflow: + case Intrinsic::uadd_with_overflow: + case Intrinsic::ssub_with_overflow: + case Intrinsic::usub_with_overflow: + case Intrinsic::smul_with_overflow: + case Intrinsic::umul_with_overflow: { + // This implements the basic lowering of the xalu with overflow intrinsics + // into add/sub/mul followed by either seto or setb. + const Function *Callee = II->getCalledFunction(); + auto *Ty = cast<StructType>(Callee->getReturnType()); + Type *RetTy = Ty->getTypeAtIndex(0U); + Type *CondTy = Ty->getTypeAtIndex(1); + + MVT VT; + if (!isTypeLegal(RetTy, VT)) + return false; + + if (VT < MVT::i8 || VT > MVT::i64) + return false; + + const Value *LHS = II->getArgOperand(0); + const Value *RHS = II->getArgOperand(1); + + // Canonicalize immediate to the RHS. + if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS) && + isCommutativeIntrinsic(II)) + std::swap(LHS, RHS); + + bool UseIncDec = false; + if (isa<ConstantInt>(RHS) && cast<ConstantInt>(RHS)->isOne()) + UseIncDec = true; + + unsigned BaseOpc, CondOpc; + switch (II->getIntrinsicID()) { + default: llvm_unreachable("Unexpected intrinsic!"); + case Intrinsic::sadd_with_overflow: + BaseOpc = UseIncDec ? 
unsigned(X86ISD::INC) : unsigned(ISD::ADD); + CondOpc = X86::SETOr; + break; + case Intrinsic::uadd_with_overflow: + BaseOpc = ISD::ADD; CondOpc = X86::SETBr; break; + case Intrinsic::ssub_with_overflow: + BaseOpc = UseIncDec ? unsigned(X86ISD::DEC) : unsigned(ISD::SUB); + CondOpc = X86::SETOr; + break; + case Intrinsic::usub_with_overflow: + BaseOpc = ISD::SUB; CondOpc = X86::SETBr; break; + case Intrinsic::smul_with_overflow: + BaseOpc = X86ISD::SMUL; CondOpc = X86::SETOr; break; + case Intrinsic::umul_with_overflow: + BaseOpc = X86ISD::UMUL; CondOpc = X86::SETOr; break; + } + + unsigned LHSReg = getRegForValue(LHS); + if (LHSReg == 0) + return false; + bool LHSIsKill = hasTrivialKill(LHS); + + unsigned ResultReg = 0; + // Check if we have an immediate version. + if (const auto *CI = dyn_cast<ConstantInt>(RHS)) { + static const unsigned Opc[2][4] = { + { X86::INC8r, X86::INC16r, X86::INC32r, X86::INC64r }, + { X86::DEC8r, X86::DEC16r, X86::DEC32r, X86::DEC64r } + }; + + if (BaseOpc == X86ISD::INC || BaseOpc == X86ISD::DEC) { + ResultReg = createResultReg(TLI.getRegClassFor(VT)); + bool IsDec = BaseOpc == X86ISD::DEC; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Opc[IsDec][VT.SimpleTy-MVT::i8]), ResultReg) + .addReg(LHSReg, getKillRegState(LHSIsKill)); + } else + ResultReg = fastEmit_ri(VT, VT, BaseOpc, LHSReg, LHSIsKill, + CI->getZExtValue()); + } + + unsigned RHSReg; + bool RHSIsKill; + if (!ResultReg) { + RHSReg = getRegForValue(RHS); + if (RHSReg == 0) + return false; + RHSIsKill = hasTrivialKill(RHS); + ResultReg = fastEmit_rr(VT, VT, BaseOpc, LHSReg, LHSIsKill, RHSReg, + RHSIsKill); + } + + // FastISel doesn't have a pattern for all X86::MUL*r and X86::IMUL*r. Emit + // it manually. + if (BaseOpc == X86ISD::UMUL && !ResultReg) { + static const unsigned MULOpc[] = + { X86::MUL8r, X86::MUL16r, X86::MUL32r, X86::MUL64r }; + static const unsigned Reg[] = { X86::AL, X86::AX, X86::EAX, X86::RAX }; + // First copy the first operand into RAX, which is an implicit input to + // the X86::MUL*r instruction. + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), Reg[VT.SimpleTy-MVT::i8]) + .addReg(LHSReg, getKillRegState(LHSIsKill)); + ResultReg = fastEmitInst_r(MULOpc[VT.SimpleTy-MVT::i8], + TLI.getRegClassFor(VT), RHSReg, RHSIsKill); + } else if (BaseOpc == X86ISD::SMUL && !ResultReg) { + static const unsigned MULOpc[] = + { X86::IMUL8r, X86::IMUL16rr, X86::IMUL32rr, X86::IMUL64rr }; + if (VT == MVT::i8) { + // Copy the first operand into AL, which is an implicit input to the + // X86::IMUL8r instruction. 
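+        // IMUL8r multiplies AL by its operand into AX and sets OF/CF when the
+        // signed result does not fit in 8 bits, which the SETO below captures.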
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), X86::AL) + .addReg(LHSReg, getKillRegState(LHSIsKill)); + ResultReg = fastEmitInst_r(MULOpc[0], TLI.getRegClassFor(VT), RHSReg, + RHSIsKill); + } else + ResultReg = fastEmitInst_rr(MULOpc[VT.SimpleTy-MVT::i8], + TLI.getRegClassFor(VT), LHSReg, LHSIsKill, + RHSReg, RHSIsKill); + } + + if (!ResultReg) + return false; + + unsigned ResultReg2 = FuncInfo.CreateRegs(CondTy); + assert((ResultReg+1) == ResultReg2 && "Nonconsecutive result registers."); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CondOpc), + ResultReg2); + + updateValueMap(II, ResultReg, 2); + return true; + } + case Intrinsic::x86_sse_cvttss2si: + case Intrinsic::x86_sse_cvttss2si64: + case Intrinsic::x86_sse2_cvttsd2si: + case Intrinsic::x86_sse2_cvttsd2si64: { + bool IsInputDouble; + switch (II->getIntrinsicID()) { + default: llvm_unreachable("Unexpected intrinsic."); + case Intrinsic::x86_sse_cvttss2si: + case Intrinsic::x86_sse_cvttss2si64: + if (!Subtarget->hasSSE1()) + return false; + IsInputDouble = false; + break; + case Intrinsic::x86_sse2_cvttsd2si: + case Intrinsic::x86_sse2_cvttsd2si64: + if (!Subtarget->hasSSE2()) + return false; + IsInputDouble = true; + break; + } + + Type *RetTy = II->getCalledFunction()->getReturnType(); + MVT VT; + if (!isTypeLegal(RetTy, VT)) + return false; + + static const unsigned CvtOpc[2][2][2] = { + { { X86::CVTTSS2SIrr, X86::VCVTTSS2SIrr }, + { X86::CVTTSS2SI64rr, X86::VCVTTSS2SI64rr } }, + { { X86::CVTTSD2SIrr, X86::VCVTTSD2SIrr }, + { X86::CVTTSD2SI64rr, X86::VCVTTSD2SI64rr } } + }; + bool HasAVX = Subtarget->hasAVX(); + unsigned Opc; + switch (VT.SimpleTy) { + default: llvm_unreachable("Unexpected result type."); + case MVT::i32: Opc = CvtOpc[IsInputDouble][0][HasAVX]; break; + case MVT::i64: Opc = CvtOpc[IsInputDouble][1][HasAVX]; break; + } + + // Check if we can fold insertelement instructions into the convert. + const Value *Op = II->getArgOperand(0); + while (auto *IE = dyn_cast<InsertElementInst>(Op)) { + const Value *Index = IE->getOperand(2); + if (!isa<ConstantInt>(Index)) + break; + unsigned Idx = cast<ConstantInt>(Index)->getZExtValue(); + + if (Idx == 0) { + Op = IE->getOperand(1); + break; + } + Op = IE->getOperand(0); + } + + unsigned Reg = getRegForValue(Op); + if (Reg == 0) + return false; + + unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) + .addReg(Reg); + + updateValueMap(II, ResultReg); + return true; + } + } +} + +bool X86FastISel::fastLowerArguments() { + if (!FuncInfo.CanLowerReturn) + return false; + + const Function *F = FuncInfo.Fn; + if (F->isVarArg()) + return false; + + CallingConv::ID CC = F->getCallingConv(); + if (CC != CallingConv::C) + return false; + + if (Subtarget->isCallingConvWin64(CC)) + return false; + + if (!Subtarget->is64Bit()) + return false; + + // Only handle simple cases. i.e. Up to 6 i32/i64 scalar arguments. + unsigned GPRCnt = 0; + unsigned FPRCnt = 0; + unsigned Idx = 0; + for (auto const &Arg : F->args()) { + // The first argument is at index 1. 
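+    // (Attribute index 0 refers to the return value, so argument attributes
+    // start at index 1.)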
+ ++Idx; + if (F->getAttributes().hasAttribute(Idx, Attribute::ByVal) || + F->getAttributes().hasAttribute(Idx, Attribute::InReg) || + F->getAttributes().hasAttribute(Idx, Attribute::StructRet) || + F->getAttributes().hasAttribute(Idx, Attribute::Nest)) + return false; + + Type *ArgTy = Arg.getType(); + if (ArgTy->isStructTy() || ArgTy->isArrayTy() || ArgTy->isVectorTy()) + return false; + + EVT ArgVT = TLI.getValueType(ArgTy); + if (!ArgVT.isSimple()) return false; + switch (ArgVT.getSimpleVT().SimpleTy) { + default: return false; + case MVT::i32: + case MVT::i64: + ++GPRCnt; + break; + case MVT::f32: + case MVT::f64: + if (!Subtarget->hasSSE1()) + return false; + ++FPRCnt; + break; + } + + if (GPRCnt > 6) + return false; + + if (FPRCnt > 8) + return false; + } + + static const MCPhysReg GPR32ArgRegs[] = { + X86::EDI, X86::ESI, X86::EDX, X86::ECX, X86::R8D, X86::R9D + }; + static const MCPhysReg GPR64ArgRegs[] = { + X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8 , X86::R9 + }; + static const MCPhysReg XMMArgRegs[] = { + X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, + X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 + }; + + unsigned GPRIdx = 0; + unsigned FPRIdx = 0; + for (auto const &Arg : F->args()) { + MVT VT = TLI.getSimpleValueType(Arg.getType()); + const TargetRegisterClass *RC = TLI.getRegClassFor(VT); + unsigned SrcReg; + switch (VT.SimpleTy) { + default: llvm_unreachable("Unexpected value type."); + case MVT::i32: SrcReg = GPR32ArgRegs[GPRIdx++]; break; + case MVT::i64: SrcReg = GPR64ArgRegs[GPRIdx++]; break; + case MVT::f32: // fall-through + case MVT::f64: SrcReg = XMMArgRegs[FPRIdx++]; break; + } + unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC); + // FIXME: Unfortunately it's necessary to emit a copy from the livein copy. + // Without this, EmitLiveInCopies may eliminate the livein if its only + // use is a bitcast (which isn't turned into an instruction). + unsigned ResultReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), ResultReg) + .addReg(DstReg, getKillRegState(true)); + updateValueMap(&Arg, ResultReg); + } + return true; +} + +static unsigned computeBytesPoppedByCallee(const X86Subtarget *Subtarget, + CallingConv::ID CC, + ImmutableCallSite *CS) { + if (Subtarget->is64Bit()) + return 0; + if (Subtarget->getTargetTriple().isOSMSVCRT()) + return 0; + if (CC == CallingConv::Fast || CC == CallingConv::GHC || + CC == CallingConv::HiPE) + return 0; + if (CS && !CS->paramHasAttr(1, Attribute::StructRet)) + return 0; + if (CS && CS->paramHasAttr(1, Attribute::InReg)) + return 0; + return 4; +} + +bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { + auto &OutVals = CLI.OutVals; + auto &OutFlags = CLI.OutFlags; + auto &OutRegs = CLI.OutRegs; + auto &Ins = CLI.Ins; + auto &InRegs = CLI.InRegs; + CallingConv::ID CC = CLI.CallConv; + bool &IsTailCall = CLI.IsTailCall; + bool IsVarArg = CLI.IsVarArg; + const Value *Callee = CLI.Callee; + const char *SymName = CLI.SymName; + + bool Is64Bit = Subtarget->is64Bit(); + bool IsWin64 = Subtarget->isCallingConvWin64(CC); + + // Handle only C, fastcc, and webkit_js calling conventions for now. + switch (CC) { + default: return false; + case CallingConv::C: + case CallingConv::Fast: + case CallingConv::WebKit_JS: + case CallingConv::X86_FastCall: + case CallingConv::X86_64_Win64: + case CallingConv::X86_64_SysV: + break; + } + + // Allow SelectionDAG isel to handle tail calls. 
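+  // (Returning false here bails out of FastISel for this call so that the
+  // SelectionDAG path, which understands tail calls, handles it instead.)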
+ if (IsTailCall) + return false; + + // fastcc with -tailcallopt is intended to provide a guaranteed + // tail call optimization. Fastisel doesn't know how to do that. + if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) + return false; + + // Don't know how to handle Win64 varargs yet. Nothing special needed for + // x86-32. Special handling for x86-64 is implemented. + if (IsVarArg && IsWin64) + return false; + + // Don't know about inalloca yet. + if (CLI.CS && CLI.CS->hasInAllocaArgument()) + return false; + + // Fast-isel doesn't know about callee-pop yet. + if (X86::isCalleePop(CC, Subtarget->is64Bit(), IsVarArg, + TM.Options.GuaranteedTailCallOpt)) + return false; + + SmallVector<MVT, 16> OutVTs; + SmallVector<unsigned, 16> ArgRegs; + + // If this is a constant i1/i8/i16 argument, promote to i32 to avoid an extra + // instruction. This is safe because it is common to all FastISel supported + // calling conventions on x86. + for (int i = 0, e = OutVals.size(); i != e; ++i) { + Value *&Val = OutVals[i]; + ISD::ArgFlagsTy Flags = OutFlags[i]; + if (auto *CI = dyn_cast<ConstantInt>(Val)) { + if (CI->getBitWidth() < 32) { + if (Flags.isSExt()) + Val = ConstantExpr::getSExt(CI, Type::getInt32Ty(CI->getContext())); + else + Val = ConstantExpr::getZExt(CI, Type::getInt32Ty(CI->getContext())); + } + } + + // Passing bools around ends up doing a trunc to i1 and passing it. + // Codegen this as an argument + "and 1". + MVT VT; + auto *TI = dyn_cast<TruncInst>(Val); + unsigned ResultReg; + if (TI && TI->getType()->isIntegerTy(1) && CLI.CS && + (TI->getParent() == CLI.CS->getInstruction()->getParent()) && + TI->hasOneUse()) { + Value *PrevVal = TI->getOperand(0); + ResultReg = getRegForValue(PrevVal); + + if (!ResultReg) + return false; + + if (!isTypeLegal(PrevVal->getType(), VT)) + return false; + + ResultReg = + fastEmit_ri(VT, VT, ISD::AND, ResultReg, hasTrivialKill(PrevVal), 1); + } else { + if (!isTypeLegal(Val->getType(), VT)) + return false; + ResultReg = getRegForValue(Val); + } + + if (!ResultReg) + return false; + + ArgRegs.push_back(ResultReg); + OutVTs.push_back(VT); + } + + // Analyze operands of the call, assigning locations to each operand. + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, ArgLocs, CLI.RetTy->getContext()); + + // Allocate shadow area for Win64 + if (IsWin64) + CCInfo.AllocateStack(32, 8); + + CCInfo.AnalyzeCallOperands(OutVTs, OutFlags, CC_X86); + + // Get a count of how many bytes are to be pushed on the stack. + unsigned NumBytes = CCInfo.getNextStackOffset(); + + // Issue CALLSEQ_START + unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown)) + .addImm(NumBytes); + + // Walk the register/memloc assignments, inserting copies/loads. + const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( + TM.getSubtargetImpl()->getRegisterInfo()); + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign const &VA = ArgLocs[i]; + const Value *ArgVal = OutVals[VA.getValNo()]; + MVT ArgVT = OutVTs[VA.getValNo()]; + + if (ArgVT == MVT::x86mmx) + return false; + + unsigned ArgReg = ArgRegs[VA.getValNo()]; + + // Promote the value if needed. 
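+    // e.g. an i8 or i16 argument assigned to a 32-bit location is widened
+    // here with a sign-, zero-, or any-extend as the calling convention
+    // requires.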
+ switch (VA.getLocInfo()) { + case CCValAssign::Full: break; + case CCValAssign::SExt: { + assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() && + "Unexpected extend"); + bool Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), ArgReg, + ArgVT, ArgReg); + assert(Emitted && "Failed to emit a sext!"); (void)Emitted; + ArgVT = VA.getLocVT(); + break; + } + case CCValAssign::ZExt: { + assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() && + "Unexpected extend"); + bool Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), ArgReg, + ArgVT, ArgReg); + assert(Emitted && "Failed to emit a zext!"); (void)Emitted; + ArgVT = VA.getLocVT(); + break; + } + case CCValAssign::AExt: { + assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() && + "Unexpected extend"); + bool Emitted = X86FastEmitExtend(ISD::ANY_EXTEND, VA.getLocVT(), ArgReg, + ArgVT, ArgReg); + if (!Emitted) + Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), ArgReg, + ArgVT, ArgReg); + if (!Emitted) + Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), ArgReg, + ArgVT, ArgReg); + + assert(Emitted && "Failed to emit a aext!"); (void)Emitted; + ArgVT = VA.getLocVT(); + break; + } + case CCValAssign::BCvt: { + ArgReg = fastEmit_r(ArgVT, VA.getLocVT(), ISD::BITCAST, ArgReg, + /*TODO: Kill=*/false); + assert(ArgReg && "Failed to emit a bitcast!"); + ArgVT = VA.getLocVT(); + break; + } + case CCValAssign::VExt: + // VExt has not been implemented, so this should be impossible to reach + // for now. However, fallback to Selection DAG isel once implemented. + return false; + case CCValAssign::AExtUpper: + case CCValAssign::SExtUpper: + case CCValAssign::ZExtUpper: + case CCValAssign::FPExt: + llvm_unreachable("Unexpected loc info!"); + case CCValAssign::Indirect: + // FIXME: Indirect doesn't need extending, but fast-isel doesn't fully + // support this. + return false; + } + + if (VA.isRegLoc()) { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(ArgReg); + OutRegs.push_back(VA.getLocReg()); + } else { + assert(VA.isMemLoc()); + + // Don't emit stores for undef values. + if (isa<UndefValue>(ArgVal)) + continue; + + unsigned LocMemOffset = VA.getLocMemOffset(); + X86AddressMode AM; + AM.Base.Reg = RegInfo->getStackRegister(); + AM.Disp = LocMemOffset; + ISD::ArgFlagsTy Flags = OutFlags[VA.getValNo()]; + unsigned Alignment = DL.getABITypeAlignment(ArgVal->getType()); + MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( + MachinePointerInfo::getStack(LocMemOffset), MachineMemOperand::MOStore, + ArgVT.getStoreSize(), Alignment); + if (Flags.isByVal()) { + X86AddressMode SrcAM; + SrcAM.Base.Reg = ArgReg; + if (!TryEmitSmallMemcpy(AM, SrcAM, Flags.getByValSize())) + return false; + } else if (isa<ConstantInt>(ArgVal) || isa<ConstantPointerNull>(ArgVal)) { + // If this is a really simple value, emit this with the Value* version + // of X86FastEmitStore. If it isn't simple, we don't want to do this, + // as it can cause us to reevaluate the argument. + if (!X86FastEmitStore(ArgVT, ArgVal, AM, MMO)) + return false; + } else { + bool ValIsKill = hasTrivialKill(ArgVal); + if (!X86FastEmitStore(ArgVT, ArgReg, ValIsKill, AM, MMO)) + return false; + } + } + } + + // ELF / PIC requires GOT in the EBX register before function calls via PLT + // GOT pointer. 
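+  // The GOT base lives in a virtual register set up in the entry block; it is
+  // copied into EBX here just before the call.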
+ if (Subtarget->isPICStyleGOT()) { + unsigned Base = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), X86::EBX).addReg(Base); + } + + if (Is64Bit && IsVarArg && !IsWin64) { + // From AMD64 ABI document: + // For calls that may call functions that use varargs or stdargs + // (prototype-less calls or calls to functions containing ellipsis (...) in + // the declaration) %al is used as hidden argument to specify the number + // of SSE registers used. The contents of %al do not need to match exactly + // the number of registers, but must be an ubound on the number of SSE + // registers used and is in the range 0 - 8 inclusive. + + // Count the number of XMM registers allocated. + static const MCPhysReg XMMArgRegs[] = { + X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, + X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 + }; + unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8); + assert((Subtarget->hasSSE1() || !NumXMMRegs) + && "SSE registers cannot be used when SSE is disabled"); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV8ri), + X86::AL).addImm(NumXMMRegs); + } + + // Materialize callee address in a register. FIXME: GV address can be + // handled with a CALLpcrel32 instead. + X86AddressMode CalleeAM; + if (!X86SelectCallAddress(Callee, CalleeAM)) + return false; + + unsigned CalleeOp = 0; + const GlobalValue *GV = nullptr; + if (CalleeAM.GV != nullptr) { + GV = CalleeAM.GV; + } else if (CalleeAM.Base.Reg != 0) { + CalleeOp = CalleeAM.Base.Reg; + } else + return false; + + // Issue the call. + MachineInstrBuilder MIB; + if (CalleeOp) { + // Register-indirect call. + unsigned CallOpc = Is64Bit ? X86::CALL64r : X86::CALL32r; + MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CallOpc)) + .addReg(CalleeOp); + } else { + // Direct call. + assert(GV && "Not a direct call"); + unsigned CallOpc = Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32; + + // See if we need any target-specific flags on the GV operand. + unsigned char OpFlags = 0; + + // On ELF targets, in both X86-64 and X86-32 mode, direct calls to + // external symbols most go through the PLT in PIC mode. If the symbol + // has hidden or protected visibility, or if it is static or local, then + // we don't need to use the PLT - we can directly call it. + if (Subtarget->isTargetELF() && + TM.getRelocationModel() == Reloc::PIC_ && + GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) { + OpFlags = X86II::MO_PLT; + } else if (Subtarget->isPICStyleStubAny() && + (GV->isDeclaration() || GV->isWeakForLinker()) && + (!Subtarget->getTargetTriple().isMacOSX() || + Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { + // PC-relative references to external symbols should go through $stub, + // unless we're building with the leopard linker or later, which + // automatically synthesizes these stubs. + OpFlags = X86II::MO_DARWIN_STUB; + } + + MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CallOpc)); + if (SymName) + MIB.addExternalSymbol(SymName, OpFlags); + else + MIB.addGlobalAddress(GV, 0, OpFlags); + } + + // Add a register mask operand representing the call-preserved registers. + // Proper defs for return values will be added by setPhysRegsDeadExcept(). + MIB.addRegMask(TRI.getCallPreservedMask(CC)); + + // Add an implicit use GOT pointer in EBX. 
+ if (Subtarget->isPICStyleGOT()) + MIB.addReg(X86::EBX, RegState::Implicit); + + if (Is64Bit && IsVarArg && !IsWin64) + MIB.addReg(X86::AL, RegState::Implicit); + + // Add implicit physical register uses to the call. + for (auto Reg : OutRegs) + MIB.addReg(Reg, RegState::Implicit); + + // Issue CALLSEQ_END + unsigned NumBytesForCalleeToPop = + computeBytesPoppedByCallee(Subtarget, CC, CLI.CS); + unsigned AdjStackUp = TII.getCallFrameDestroyOpcode(); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackUp)) + .addImm(NumBytes).addImm(NumBytesForCalleeToPop); + + // Now handle call return values. + SmallVector<CCValAssign, 16> RVLocs; + CCState CCRetInfo(CC, IsVarArg, *FuncInfo.MF, RVLocs, + CLI.RetTy->getContext()); + CCRetInfo.AnalyzeCallResult(Ins, RetCC_X86); + + // Copy all of the result registers out of their specified physreg. + unsigned ResultReg = FuncInfo.CreateRegs(CLI.RetTy); + for (unsigned i = 0; i != RVLocs.size(); ++i) { + CCValAssign &VA = RVLocs[i]; + EVT CopyVT = VA.getValVT(); + unsigned CopyReg = ResultReg + i; + + // If this is x86-64, and we disabled SSE, we can't return FP values + if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) && + ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) { + report_fatal_error("SSE register return with SSE disabled"); + } + + // If we prefer to use the value in xmm registers, copy it out as f80 and + // use a truncate to move it from fp stack reg to xmm reg. + if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) && + isScalarFPTypeInSSEReg(VA.getValVT())) { + CopyVT = MVT::f80; + CopyReg = createResultReg(&X86::RFP80RegClass); + } + + // Copy out the result. + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), CopyReg).addReg(VA.getLocReg()); + InRegs.push_back(VA.getLocReg()); + + // Round the f80 to the right size, which also moves it to the appropriate + // xmm register. This is accomplished by storing the f80 value in memory + // and then loading it back. + if (CopyVT != VA.getValVT()) { + EVT ResVT = VA.getValVT(); + unsigned Opc = ResVT == MVT::f32 ? X86::ST_Fp80m32 : X86::ST_Fp80m64; + unsigned MemSize = ResVT.getSizeInBits()/8; + int FI = MFI.CreateStackObject(MemSize, MemSize, false); + addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Opc)), FI) + .addReg(CopyReg); + Opc = ResVT == MVT::f32 ? 
X86::MOVSSrm : X86::MOVSDrm; + addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Opc), ResultReg + i), FI); + } + } + + CLI.ResultReg = ResultReg; + CLI.NumResultRegs = RVLocs.size(); + CLI.Call = MIB; + + return true; +} + +bool +X86FastISel::fastSelectInstruction(const Instruction *I) { + switch (I->getOpcode()) { + default: break; + case Instruction::Load: + return X86SelectLoad(I); + case Instruction::Store: + return X86SelectStore(I); + case Instruction::Ret: + return X86SelectRet(I); + case Instruction::ICmp: + case Instruction::FCmp: + return X86SelectCmp(I); + case Instruction::ZExt: + return X86SelectZExt(I); + case Instruction::Br: + return X86SelectBranch(I); + case Instruction::LShr: + case Instruction::AShr: + case Instruction::Shl: + return X86SelectShift(I); + case Instruction::SDiv: + case Instruction::UDiv: + case Instruction::SRem: + case Instruction::URem: + return X86SelectDivRem(I); + case Instruction::Select: + return X86SelectSelect(I); + case Instruction::Trunc: + return X86SelectTrunc(I); + case Instruction::FPExt: + return X86SelectFPExt(I); + case Instruction::FPTrunc: + return X86SelectFPTrunc(I); + case Instruction::IntToPtr: // Deliberate fall-through. + case Instruction::PtrToInt: { + EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType()); + EVT DstVT = TLI.getValueType(I->getType()); + if (DstVT.bitsGT(SrcVT)) + return X86SelectZExt(I); + if (DstVT.bitsLT(SrcVT)) + return X86SelectTrunc(I); + unsigned Reg = getRegForValue(I->getOperand(0)); + if (Reg == 0) return false; + updateValueMap(I, Reg); + return true; + } + } + + return false; +} + +unsigned X86FastISel::X86MaterializeInt(const ConstantInt *CI, MVT VT) { + if (VT > MVT::i64) + return 0; + + uint64_t Imm = CI->getZExtValue(); + if (Imm == 0) { + unsigned SrcReg = fastEmitInst_(X86::MOV32r0, &X86::GR32RegClass); + switch (VT.SimpleTy) { + default: llvm_unreachable("Unexpected value type"); + case MVT::i1: + case MVT::i8: + return fastEmitInst_extractsubreg(MVT::i8, SrcReg, /*Kill=*/true, + X86::sub_8bit); + case MVT::i16: + return fastEmitInst_extractsubreg(MVT::i16, SrcReg, /*Kill=*/true, + X86::sub_16bit); + case MVT::i32: + return SrcReg; + case MVT::i64: { + unsigned ResultReg = createResultReg(&X86::GR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::SUBREG_TO_REG), ResultReg) + .addImm(0).addReg(SrcReg).addImm(X86::sub_32bit); + return ResultReg; + } + } + } + + unsigned Opc = 0; + switch (VT.SimpleTy) { + default: llvm_unreachable("Unexpected value type"); + case MVT::i1: VT = MVT::i8; // fall-through + case MVT::i8: Opc = X86::MOV8ri; break; + case MVT::i16: Opc = X86::MOV16ri; break; + case MVT::i32: Opc = X86::MOV32ri; break; + case MVT::i64: { + if (isUInt<32>(Imm)) + Opc = X86::MOV32ri; + else if (isInt<32>(Imm)) + Opc = X86::MOV64ri32; + else + Opc = X86::MOV64ri; + break; + } + } + if (VT == MVT::i64 && Opc == X86::MOV32ri) { + unsigned SrcReg = fastEmitInst_i(Opc, &X86::GR32RegClass, Imm); + unsigned ResultReg = createResultReg(&X86::GR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::SUBREG_TO_REG), ResultReg) + .addImm(0).addReg(SrcReg).addImm(X86::sub_32bit); + return ResultReg; + } + return fastEmitInst_i(Opc, TLI.getRegClassFor(VT), Imm); +} + +unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) { + if (CFP->isNullValue()) + return fastMaterializeFloatZero(CFP); + + // Can't handle alternate code models yet. 
+ CodeModel::Model CM = TM.getCodeModel(); + if (CM != CodeModel::Small && CM != CodeModel::Large) + return 0; + + // Get opcode and regclass of the output for the given load instruction. + unsigned Opc = 0; + const TargetRegisterClass *RC = nullptr; + switch (VT.SimpleTy) { + default: return 0; + case MVT::f32: + if (X86ScalarSSEf32) { + Opc = Subtarget->hasAVX() ? X86::VMOVSSrm : X86::MOVSSrm; + RC = &X86::FR32RegClass; + } else { + Opc = X86::LD_Fp32m; + RC = &X86::RFP32RegClass; + } + break; + case MVT::f64: + if (X86ScalarSSEf64) { + Opc = Subtarget->hasAVX() ? X86::VMOVSDrm : X86::MOVSDrm; + RC = &X86::FR64RegClass; + } else { + Opc = X86::LD_Fp64m; + RC = &X86::RFP64RegClass; + } + break; + case MVT::f80: + // No f80 support yet. + return 0; + } + + // MachineConstantPool wants an explicit alignment. + unsigned Align = DL.getPrefTypeAlignment(CFP->getType()); + if (Align == 0) { + // Alignment of vector types. FIXME! + Align = DL.getTypeAllocSize(CFP->getType()); + } + + // x86-32 PIC requires a PIC base register for constant pools. + unsigned PICBase = 0; + unsigned char OpFlag = 0; + if (Subtarget->isPICStyleStubPIC()) { // Not dynamic-no-pic + OpFlag = X86II::MO_PIC_BASE_OFFSET; + PICBase = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF); + } else if (Subtarget->isPICStyleGOT()) { + OpFlag = X86II::MO_GOTOFF; + PICBase = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF); + } else if (Subtarget->isPICStyleRIPRel() && + TM.getCodeModel() == CodeModel::Small) { + PICBase = X86::RIP; + } + + // Create the load from the constant pool. + unsigned CPI = MCP.getConstantPoolIndex(CFP, Align); + unsigned ResultReg = createResultReg(RC); + + if (CM == CodeModel::Large) { + unsigned AddrReg = createResultReg(&X86::GR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV64ri), + AddrReg) + .addConstantPoolIndex(CPI, 0, OpFlag); + MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Opc), ResultReg); + addDirectMem(MIB, AddrReg); + MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( + MachinePointerInfo::getConstantPool(), MachineMemOperand::MOLoad, + TM.getDataLayout()->getPointerSize(), Align); + MIB->addMemOperand(*FuncInfo.MF, MMO); + return ResultReg; + } + + addConstantPoolReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Opc), ResultReg), + CPI, PICBase, OpFlag); + return ResultReg; +} + +unsigned X86FastISel::X86MaterializeGV(const GlobalValue *GV, MVT VT) { + // Can't handle alternate code models yet. + if (TM.getCodeModel() != CodeModel::Small) + return 0; + + // Materialize addresses with LEA/MOV instructions. + X86AddressMode AM; + if (X86SelectAddress(GV, AM)) { + // If the expression is just a basereg, then we're done, otherwise we need + // to emit an LEA. + if (AM.BaseType == X86AddressMode::RegBase && + AM.IndexReg == 0 && AM.Disp == 0 && AM.GV == nullptr) + return AM.Base.Reg; + + unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); + if (TM.getRelocationModel() == Reloc::Static && + TLI.getPointerTy() == MVT::i64) { + // The displacement code could be more than 32 bits away so we need to use + // an instruction with a 64 bit immediate + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV64ri), + ResultReg) + .addGlobalAddress(GV); + } else { + unsigned Opc = TLI.getPointerTy() == MVT::i32 + ? (Subtarget->isTarget64BitILP32() + ? 
X86::LEA64_32r : X86::LEA32r) + : X86::LEA64r; + addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Opc), ResultReg), AM); + } + return ResultReg; + } + return 0; +} + +unsigned X86FastISel::fastMaterializeConstant(const Constant *C) { + EVT CEVT = TLI.getValueType(C->getType(), true); + + // Only handle simple types. + if (!CEVT.isSimple()) + return 0; + MVT VT = CEVT.getSimpleVT(); + + if (const auto *CI = dyn_cast<ConstantInt>(C)) + return X86MaterializeInt(CI, VT); + else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C)) + return X86MaterializeFP(CFP, VT); + else if (const GlobalValue *GV = dyn_cast<GlobalValue>(C)) + return X86MaterializeGV(GV, VT); + + return 0; +} + +unsigned X86FastISel::fastMaterializeAlloca(const AllocaInst *C) { + // Fail on dynamic allocas. At this point, getRegForValue has already + // checked its CSE maps, so if we're here trying to handle a dynamic + // alloca, we're not going to succeed. X86SelectAddress has a + // check for dynamic allocas, because it's called directly from + // various places, but targetMaterializeAlloca also needs a check + // in order to avoid recursion between getRegForValue, + // X86SelectAddrss, and targetMaterializeAlloca. + if (!FuncInfo.StaticAllocaMap.count(C)) + return 0; + assert(C->isStaticAlloca() && "dynamic alloca in the static alloca map?"); + + X86AddressMode AM; + if (!X86SelectAddress(C, AM)) + return 0; + unsigned Opc = TLI.getPointerTy() == MVT::i32 + ? (Subtarget->isTarget64BitILP32() + ? X86::LEA64_32r : X86::LEA32r) + : X86::LEA64r; + const TargetRegisterClass* RC = TLI.getRegClassFor(TLI.getPointerTy()); + unsigned ResultReg = createResultReg(RC); + addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Opc), ResultReg), AM); + return ResultReg; +} + +unsigned X86FastISel::fastMaterializeFloatZero(const ConstantFP *CF) { + MVT VT; + if (!isTypeLegal(CF->getType(), VT)) + return 0; + + // Get opcode and regclass for the given zero. + unsigned Opc = 0; + const TargetRegisterClass *RC = nullptr; + switch (VT.SimpleTy) { + default: return 0; + case MVT::f32: + if (X86ScalarSSEf32) { + Opc = X86::FsFLD0SS; + RC = &X86::FR32RegClass; + } else { + Opc = X86::LD_Fp032; + RC = &X86::RFP32RegClass; + } + break; + case MVT::f64: + if (X86ScalarSSEf64) { + Opc = X86::FsFLD0SD; + RC = &X86::FR64RegClass; + } else { + Opc = X86::LD_Fp064; + RC = &X86::RFP64RegClass; + } + break; + case MVT::f80: + // No f80 support yet. 
+ return 0; + } + + unsigned ResultReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg); + return ResultReg; +} + + +bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, + const LoadInst *LI) { + const Value *Ptr = LI->getPointerOperand(); + X86AddressMode AM; + if (!X86SelectAddress(Ptr, AM)) + return false; + + const X86InstrInfo &XII = (const X86InstrInfo &)TII; + + unsigned Size = DL.getTypeAllocSize(LI->getType()); + unsigned Alignment = LI->getAlignment(); + + if (Alignment == 0) // Ensure that codegen never sees alignment 0 + Alignment = DL.getABITypeAlignment(LI->getType()); + + SmallVector<MachineOperand, 8> AddrOps; + AM.getFullAddress(AddrOps); + + MachineInstr *Result = + XII.foldMemoryOperandImpl(*FuncInfo.MF, MI, OpNo, AddrOps, + Size, Alignment, /*AllowCommute=*/true); + if (!Result) + return false; + + Result->addMemOperand(*FuncInfo.MF, createMachineMemOperandFor(LI)); + FuncInfo.MBB->insert(FuncInfo.InsertPt, Result); + MI->eraseFromParent(); + return true; +} + + +namespace llvm { + FastISel *X86::createFastISel(FunctionLoweringInfo &funcInfo, + const TargetLibraryInfo *libInfo) { + return new X86FastISel(funcInfo, libInfo); + } +} diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp index 80b141654c0..930163c3688 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -1,2020 +1,2103 @@ -//===-- X86FrameLowering.cpp - X86 Frame Information ----------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the X86 implementation of TargetFrameLowering class.
-//
-//===----------------------------------------------------------------------===//
-
-#include "X86FrameLowering.h"
-#include "X86InstrBuilder.h"
-#include "X86InstrInfo.h"
-#include "X86MachineFunctionInfo.h"
-#include "X86Subtarget.h"
-#include "X86TargetMachine.h"
-#include "llvm/ADT/SmallSet.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Function.h"
-#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Target/TargetOptions.h"
-#include "llvm/Support/Debug.h"
-#include <cstdlib>
-
-using namespace llvm;
-
-// FIXME: completely move here.
-extern cl::opt<bool> ForceStackAlign;
-
-bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
- return !MF.getFrameInfo()->hasVarSizedObjects() &&
- !MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences();
-}
-
-/// canSimplifyCallFramePseudos - If there is a reserved call frame, the
-/// call frame pseudos can be simplified. Having a FP, as in the default
-/// implementation, is not sufficient here since we can't always use it.
-/// Use a more nuanced condition.
-bool
-X86FrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const {
- const X86RegisterInfo *TRI = static_cast<const X86RegisterInfo *>
- (MF.getSubtarget().getRegisterInfo());
- return hasReservedCallFrame(MF) ||
- (hasFP(MF) && !TRI->needsStackRealignment(MF))
- || TRI->hasBasePointer(MF);
-}
-
-// needsFrameIndexResolution - Do we need to perform FI resolution for
-// this function. Normally, this is required only when the function
-// has any stack objects. However, FI resolution actually has another job,
-// not apparent from its name - it also resolves call frame setup/destroy
-// pseudos that were not simplified earlier.
-// So, this is required for x86 functions that have push sequences even
-// when there are no stack objects.
-bool
-X86FrameLowering::needsFrameIndexResolution(const MachineFunction &MF) const {
- return MF.getFrameInfo()->hasStackObjects() ||
- MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences();
-}
-
-/// hasFP - Return true if the specified function should have a dedicated frame
-/// pointer register. This is true if the function has variable sized allocas
-/// or if frame pointer elimination is disabled.
-bool X86FrameLowering::hasFP(const MachineFunction &MF) const {
- const MachineFrameInfo *MFI = MF.getFrameInfo();
- const MachineModuleInfo &MMI = MF.getMMI();
- const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
-
- return (MF.getTarget().Options.DisableFramePointerElim(MF) ||
- RegInfo->needsStackRealignment(MF) ||
- MFI->hasVarSizedObjects() ||
- MFI->isFrameAddressTaken() || MFI->hasInlineAsmWithSPAdjust() ||
- MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() ||
- MMI.callsUnwindInit() || MMI.callsEHReturn() ||
- MFI->hasStackMap() || MFI->hasPatchPoint());
-}
-
-static unsigned getSUBriOpcode(unsigned IsLP64, int64_t Imm) {
- if (IsLP64) {
- if (isInt<8>(Imm))
- return X86::SUB64ri8;
- return X86::SUB64ri32;
- } else {
- if (isInt<8>(Imm))
- return X86::SUB32ri8;
- return X86::SUB32ri;
- }
-}
-
-static unsigned getADDriOpcode(unsigned IsLP64, int64_t Imm) {
- if (IsLP64) {
- if (isInt<8>(Imm))
- return X86::ADD64ri8;
- return X86::ADD64ri32;
- } else {
- if (isInt<8>(Imm))
- return X86::ADD32ri8;
- return X86::ADD32ri;
- }
-}
-
-static unsigned getSUBrrOpcode(unsigned isLP64) {
- return isLP64 ? X86::SUB64rr : X86::SUB32rr;
-}
-
-static unsigned getADDrrOpcode(unsigned isLP64) {
- return isLP64 ? X86::ADD64rr : X86::ADD32rr;
-}
-
-static unsigned getANDriOpcode(bool IsLP64, int64_t Imm) {
- if (IsLP64) {
- if (isInt<8>(Imm))
- return X86::AND64ri8;
- return X86::AND64ri32;
- }
- if (isInt<8>(Imm))
- return X86::AND32ri8;
- return X86::AND32ri;
-}
-
-static unsigned getLEArOpcode(unsigned IsLP64) {
- return IsLP64 ? X86::LEA64r : X86::LEA32r;
-}
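As a rough illustration of the selection these pickers perform (immediates chosen arbitrarily): an adjustment that fits in a signed 8-bit field takes the ri8 form, anything else the ri32/ri form.

    getSUBriOpcode(/*IsLP64=*/true, 16);    // isInt<8>(16)      -> X86::SUB64ri8
    getSUBriOpcode(/*IsLP64=*/true, 4096);  // needs 32-bit imm  -> X86::SUB64ri32
    getADDriOpcode(/*IsLP64=*/false, 8);    // 32-bit stack ptr  -> X86::ADD32ri8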
-
-/// findDeadCallerSavedReg - Return a caller-saved register that isn't live
-/// when it reaches the "return" instruction. We can then pop a stack object
-/// to this register without worrying about clobbering it.
-static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator &MBBI,
- const TargetRegisterInfo &TRI,
- bool Is64Bit) {
- const MachineFunction *MF = MBB.getParent();
- const Function *F = MF->getFunction();
- if (!F || MF->getMMI().callsEHReturn())
- return 0;
-
- static const uint16_t CallerSavedRegs32Bit[] = {
- X86::EAX, X86::EDX, X86::ECX, 0
- };
-
- static const uint16_t CallerSavedRegs64Bit[] = {
- X86::RAX, X86::RDX, X86::RCX, X86::RSI, X86::RDI,
- X86::R8, X86::R9, X86::R10, X86::R11, 0
- };
-
- unsigned Opc = MBBI->getOpcode();
- switch (Opc) {
- default: return 0;
- case X86::RETL:
- case X86::RETQ:
- case X86::RETIL:
- case X86::RETIQ:
- case X86::TCRETURNdi:
- case X86::TCRETURNri:
- case X86::TCRETURNmi:
- case X86::TCRETURNdi64:
- case X86::TCRETURNri64:
- case X86::TCRETURNmi64:
- case X86::EH_RETURN:
- case X86::EH_RETURN64: {
- SmallSet<uint16_t, 8> Uses;
- for (unsigned i = 0, e = MBBI->getNumOperands(); i != e; ++i) {
- MachineOperand &MO = MBBI->getOperand(i);
- if (!MO.isReg() || MO.isDef())
- continue;
- unsigned Reg = MO.getReg();
- if (!Reg)
- continue;
- for (MCRegAliasIterator AI(Reg, &TRI, true); AI.isValid(); ++AI)
- Uses.insert(*AI);
- }
-
- const uint16_t *CS = Is64Bit ? CallerSavedRegs64Bit : CallerSavedRegs32Bit;
- for (; *CS; ++CS)
- if (!Uses.count(*CS))
- return *CS;
- }
- }
-
- return 0;
-}
-
-static bool isEAXLiveIn(MachineFunction &MF) {
- for (MachineRegisterInfo::livein_iterator II = MF.getRegInfo().livein_begin(),
- EE = MF.getRegInfo().livein_end(); II != EE; ++II) {
- unsigned Reg = II->first;
-
- if (Reg == X86::RAX || Reg == X86::EAX || Reg == X86::AX ||
- Reg == X86::AH || Reg == X86::AL)
- return true;
- }
-
- return false;
-}
-
-/// emitSPUpdate - Emit a series of instructions to increment / decrement the
-/// stack pointer by a constant value.
-static
-void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
- unsigned StackPtr, int64_t NumBytes,
- bool Is64BitTarget, bool Is64BitStackPtr, bool UseLEA,
- const TargetInstrInfo &TII, const TargetRegisterInfo &TRI) {
- bool isSub = NumBytes < 0;
- uint64_t Offset = isSub ? -NumBytes : NumBytes;
- unsigned Opc;
- if (UseLEA)
- Opc = getLEArOpcode(Is64BitStackPtr);
- else
- Opc = isSub
- ? getSUBriOpcode(Is64BitStackPtr, Offset)
- : getADDriOpcode(Is64BitStackPtr, Offset);
-
- uint64_t Chunk = (1LL << 31) - 1;
- DebugLoc DL = MBB.findDebugLoc(MBBI);
-
- while (Offset) {
- if (Offset > Chunk) {
- // Rather than emit a long series of instructions for large offsets,
- // load the offset into a register and do one sub/add
- unsigned Reg = 0;
-
- if (isSub && !isEAXLiveIn(*MBB.getParent()))
- Reg = (unsigned)(Is64BitTarget ? X86::RAX : X86::EAX);
- else
- Reg = findDeadCallerSavedReg(MBB, MBBI, TRI, Is64BitTarget);
-
- if (Reg) {
- Opc = Is64BitTarget ? X86::MOV64ri : X86::MOV32ri;
- BuildMI(MBB, MBBI, DL, TII.get(Opc), Reg)
- .addImm(Offset);
- Opc = isSub
- ? getSUBrrOpcode(Is64BitTarget)
- : getADDrrOpcode(Is64BitTarget);
- MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
- .addReg(StackPtr)
- .addReg(Reg);
- MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
- Offset = 0;
- continue;
- }
- }
-
- uint64_t ThisVal = (Offset > Chunk) ? Chunk : Offset;
- if (ThisVal == (Is64BitTarget ? 8 : 4)) {
- // Use push / pop instead.
- unsigned Reg = isSub
- ? (unsigned)(Is64BitTarget ? X86::RAX : X86::EAX)
- : findDeadCallerSavedReg(MBB, MBBI, TRI, Is64BitTarget);
- if (Reg) {
- Opc = isSub
- ? (Is64BitTarget ? X86::PUSH64r : X86::PUSH32r)
- : (Is64BitTarget ? X86::POP64r : X86::POP32r);
- MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc))
- .addReg(Reg, getDefRegState(!isSub) | getUndefRegState(isSub));
- if (isSub)
- MI->setFlag(MachineInstr::FrameSetup);
- Offset -= ThisVal;
- continue;
- }
- }
-
- MachineInstr *MI = nullptr;
-
- if (UseLEA) {
- MI = addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr),
- StackPtr, false, isSub ? -ThisVal : ThisVal);
- } else {
- MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
- .addReg(StackPtr)
- .addImm(ThisVal);
- MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
- }
-
- if (isSub)
- MI->setFlag(MachineInstr::FrameSetup);
-
- Offset -= ThisVal;
- }
-}
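Indicatively (the exact choice depends on the dead-register scan above, so this is only a sketch), the loop emits one of:

    ; NumBytes = -40 on x86-64                      subq $40, %rsp
    ; NumBytes = 8 before a void return             popq %rax   (first caller-saved register the return doesn't use)
    ; |NumBytes| > 2^31 - 1                         movabsq $imm, %rax ; subq %rax, %rsp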
-
-/// mergeSPUpdatesUp - Merge two stack-manipulating instructions using the
-/// upper (previous) iterator.
-static
-void mergeSPUpdatesUp(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
- unsigned StackPtr, uint64_t *NumBytes = nullptr) {
- if (MBBI == MBB.begin()) return;
-
- MachineBasicBlock::iterator PI = std::prev(MBBI);
- unsigned Opc = PI->getOpcode();
- if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 ||
- Opc == X86::ADD32ri || Opc == X86::ADD32ri8 ||
- Opc == X86::LEA32r || Opc == X86::LEA64_32r) &&
- PI->getOperand(0).getReg() == StackPtr) {
- if (NumBytes)
- *NumBytes += PI->getOperand(2).getImm();
- MBB.erase(PI);
- } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 ||
- Opc == X86::SUB32ri || Opc == X86::SUB32ri8) &&
- PI->getOperand(0).getReg() == StackPtr) {
- if (NumBytes)
- *NumBytes -= PI->getOperand(2).getImm();
- MBB.erase(PI);
- }
-}
-
-/// mergeSPUpdatesDown - Merge two stack-manipulating instructions using the
-/// lower (next) iterator.
-static
-void mergeSPUpdatesDown(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator &MBBI,
- unsigned StackPtr, uint64_t *NumBytes = nullptr) {
- // FIXME: THIS ISN'T RUN!!!
- return;
-
- if (MBBI == MBB.end()) return;
-
- MachineBasicBlock::iterator NI = std::next(MBBI);
- if (NI == MBB.end()) return;
-
- unsigned Opc = NI->getOpcode();
- if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 ||
- Opc == X86::ADD32ri || Opc == X86::ADD32ri8) &&
- NI->getOperand(0).getReg() == StackPtr) {
- if (NumBytes)
- *NumBytes -= NI->getOperand(2).getImm();
- MBB.erase(NI);
- MBBI = NI;
- } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 ||
- Opc == X86::SUB32ri || Opc == X86::SUB32ri8) &&
- NI->getOperand(0).getReg() == StackPtr) {
- if (NumBytes)
- *NumBytes += NI->getOperand(2).getImm();
- MBB.erase(NI);
- MBBI = NI;
- }
-}
-
-/// mergeSPUpdates - Checks the instruction before/after the passed
-/// instruction. If it is an ADD/SUB/LEA instruction it is deleted, and the
-/// stack adjustment is returned as a positive value for ADD/LEA and a
-/// negative value for SUB.
-static int mergeSPUpdates(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator &MBBI, unsigned StackPtr,
- bool doMergeWithPrevious) {
- if ((doMergeWithPrevious && MBBI == MBB.begin()) ||
- (!doMergeWithPrevious && MBBI == MBB.end()))
- return 0;
-
- MachineBasicBlock::iterator PI = doMergeWithPrevious ? std::prev(MBBI) : MBBI;
- MachineBasicBlock::iterator NI = doMergeWithPrevious ? nullptr
- : std::next(MBBI);
- unsigned Opc = PI->getOpcode();
- int Offset = 0;
-
- if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 ||
- Opc == X86::ADD32ri || Opc == X86::ADD32ri8 ||
- Opc == X86::LEA32r || Opc == X86::LEA64_32r) &&
- PI->getOperand(0).getReg() == StackPtr){
- Offset += PI->getOperand(2).getImm();
- MBB.erase(PI);
- if (!doMergeWithPrevious) MBBI = NI;
- } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 ||
- Opc == X86::SUB32ri || Opc == X86::SUB32ri8) &&
- PI->getOperand(0).getReg() == StackPtr) {
- Offset -= PI->getOperand(2).getImm();
- MBB.erase(PI);
- if (!doMergeWithPrevious) MBBI = NI;
- }
-
- return Offset;
-}
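A worked instance of the merge, under the assumption that the instruction just before the insertion point is an earlier SP adjustment: if that instruction is `subq $8, %rsp`, then mergeSPUpdates(MBB, MBBI, StackPtr, /*doMergeWithPrevious=*/true) erases it and returns -8, which the caller folds into the adjustment it is about to emit.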
-
-void
-X86FrameLowering::emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- DebugLoc DL) const {
- MachineFunction &MF = *MBB.getParent();
- MachineFrameInfo *MFI = MF.getFrameInfo();
- MachineModuleInfo &MMI = MF.getMMI();
- const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
- const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
-
- // Add callee saved registers to move list.
- const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
- if (CSI.empty()) return;
-
- // Calculate offsets.
- for (std::vector<CalleeSavedInfo>::const_iterator
- I = CSI.begin(), E = CSI.end(); I != E; ++I) {
- int64_t Offset = MFI->getObjectOffset(I->getFrameIdx());
- unsigned Reg = I->getReg();
-
- unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
- unsigned CFIIndex =
- MMI.addFrameInst(MCCFIInstruction::createOffset(nullptr, DwarfReg,
- Offset));
- BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
- .addCFIIndex(CFIIndex);
- }
-}
-
-/// usesTheStack - This function checks if any of the users of EFLAGS
-/// copy the EFLAGS. We know that the code that lowers COPY of EFLAGS has
-/// to use the stack, and if we don't adjust the stack we clobber the first
-/// frame index.
-/// See X86InstrInfo::copyPhysReg.
-static bool usesTheStack(const MachineFunction &MF) {
- const MachineRegisterInfo &MRI = MF.getRegInfo();
-
- for (MachineRegisterInfo::reg_instr_iterator
- ri = MRI.reg_instr_begin(X86::EFLAGS), re = MRI.reg_instr_end();
- ri != re; ++ri)
- if (ri->isCopy())
- return true;
-
- return false;
-}
-
-void X86FrameLowering::emitStackProbeCall(MachineFunction &MF,
- MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- DebugLoc DL) {
- const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
- const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
- bool Is64Bit = STI.is64Bit();
- bool IsLargeCodeModel = MF.getTarget().getCodeModel() == CodeModel::Large;
-
- unsigned CallOp;
- if (Is64Bit)
- CallOp = IsLargeCodeModel ? X86::CALL64r : X86::CALL64pcrel32;
- else
- CallOp = X86::CALLpcrel32;
-
- const char *Symbol;
- if (Is64Bit) {
- if (STI.isTargetCygMing()) {
- Symbol = "___chkstk_ms";
- } else {
- Symbol = "__chkstk";
- }
- } else if (STI.isTargetCygMing())
- Symbol = "_alloca";
- else
- Symbol = "_chkstk";
-
- MachineInstrBuilder CI;
-
- // All current stack probes take AX and SP as input, clobber flags, and
- // preserve all registers. x86_64 probes leave RSP unmodified.
- if (Is64Bit && MF.getTarget().getCodeModel() == CodeModel::Large) {
- // For the large code model, we have to call through a register. Use R11,
- // as it is scratch in all supported calling conventions.
- BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::R11)
- .addExternalSymbol(Symbol);
- CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp)).addReg(X86::R11);
- } else {
- CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp)).addExternalSymbol(Symbol);
- }
-
- unsigned AX = Is64Bit ? X86::RAX : X86::EAX;
- unsigned SP = Is64Bit ? X86::RSP : X86::ESP;
- CI.addReg(AX, RegState::Implicit)
- .addReg(SP, RegState::Implicit)
- .addReg(AX, RegState::Define | RegState::Implicit)
- .addReg(SP, RegState::Define | RegState::Implicit)
- .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
-
- if (Is64Bit) {
- // MSVC x64's __chkstk and cygwin/mingw's ___chkstk_ms do not adjust %rsp
- // themselves. They also do not clobber %rax, so we can reuse it when
- // adjusting %rsp.
- BuildMI(MBB, MBBI, DL, TII.get(X86::SUB64rr), X86::RSP)
- .addReg(X86::RSP)
- .addReg(X86::RAX);
- }
-}
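Indicatively (the actual symbol follows the target checks above), the helper emits roughly one of:

    ; 32-bit MSVC target                 calll _chkstk
    ; x86_64 MSVC, small code model      callq __chkstk          ; subq %rax, %rsp
    ; x86_64, large code model           movabsq $__chkstk, %r11 ; callq *%r11 ; subq %rax, %rsp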
-
-/// emitPrologue - Push callee-saved registers onto the stack, which
-/// automatically adjusts the stack pointer. Adjust the stack pointer to allocate
-/// space for local variables. Also emit labels used by the exception handler to
-/// generate the exception handling frames.
-
-/*
- Here's a gist of what gets emitted:
-
- ; Establish frame pointer, if needed
- [if needs FP]
- push %rbp
- .cfi_def_cfa_offset 16
- .cfi_offset %rbp, -16
- .seh_pushreg %rbp
- mov %rsp, %rbp
- .cfi_def_cfa_register %rbp
-
- ; Spill general-purpose registers
- [for all callee-saved GPRs]
- pushq %<reg>
- [if not needs FP]
- .cfi_def_cfa_offset (offset from RETADDR)
- .seh_pushreg %<reg>
-
- ; If the required stack alignment > default stack alignment
- ; rsp needs to be re-aligned. This creates a "re-alignment gap"
- ; of unknown size in the stack frame.
- [if stack needs re-alignment]
- and $MASK, %rsp
-
- ; Allocate space for locals
- [if target is Windows and allocated space > 4096 bytes]
- ; Windows needs special care for allocations larger
- ; than one page.
- mov $NNN, %rax
- call ___chkstk_ms/___chkstk
- sub %rax, %rsp
- [else]
- sub $NNN, %rsp
-
- [if needs FP]
- .seh_stackalloc (size of XMM spill slots)
- .seh_setframe %rbp, SEHFrameOffset ; = size of all spill slots
- [else]
- .seh_stackalloc NNN
-
- ; Spill XMMs
- ; Note, that while only Windows 64 ABI specifies XMMs as callee-preserved,
- ; they may get spilled on any platform, if the current function
- ; calls @llvm.eh.unwind.init
- [if needs FP]
- [for all callee-saved XMM registers]
- movaps %<xmm reg>, -MMM(%rbp)
- [for all callee-saved XMM registers]
- .seh_savexmm %<xmm reg>, (-MMM + SEHFrameOffset)
- ; i.e. the offset relative to (%rbp - SEHFrameOffset)
- [else]
- [for all callee-saved XMM registers]
- movaps %<xmm reg>, KKK(%rsp)
- [for all callee-saved XMM registers]
- .seh_savexmm %<xmm reg>, KKK
-
- .seh_endprologue
-
- [if needs base pointer]
- mov %rsp, %rbx
- [if needs to restore base pointer]
- mov %rsp, -MMM(%rbp)
-
- ; Emit CFI info
- [if needs FP]
- [for all callee-saved registers]
- .cfi_offset %<reg>, (offset from %rbp)
- [else]
- .cfi_def_cfa_offset (offset from RETADDR)
- [for all callee-saved registers]
- .cfi_offset %<reg>, (offset from %rsp)
-
- Notes:
- - .seh directives are emitted only for Windows 64 ABI
- - .cfi directives are emitted for all other ABIs
- - for 32-bit code, substitute %e?? registers for %r??
-*/
-
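As a concrete, hypothetical instance of the gist: a 64-bit function that needs a frame pointer, saves %rbx, and allocates 40 bytes of locals would get roughly

    pushq %rbp
    movq %rsp, %rbp
    pushq %rbx
    subq $40, %rsp

with the corresponding .cfi_/.seh_ directives interleaved as described above.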
-void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
- MachineBasicBlock &MBB = MF.front(); // Prologue goes in entry BB.
- MachineBasicBlock::iterator MBBI = MBB.begin();
- MachineFrameInfo *MFI = MF.getFrameInfo();
- const Function *Fn = MF.getFunction();
- const X86RegisterInfo *RegInfo =
- static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo());
- const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
- MachineModuleInfo &MMI = MF.getMMI();
- X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
- uint64_t MaxAlign = MFI->getMaxAlignment(); // Desired stack alignment.
- uint64_t StackSize = MFI->getStackSize(); // Number of bytes to allocate.
- bool HasFP = hasFP(MF);
- const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
- bool Is64Bit = STI.is64Bit();
- // standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit.
- const bool Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64();
- bool IsWin64 = STI.isTargetWin64();
- // Not necessarily synonymous with IsWin64.
- bool IsWinEH = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
- bool NeedsWinEH = IsWinEH && Fn->needsUnwindTableEntry();
- bool NeedsDwarfCFI =
- !IsWinEH && (MMI.hasDebugInfo() || Fn->needsUnwindTableEntry());
- bool UseLEA = STI.useLeaForSP();
- unsigned StackAlign = getStackAlignment();
- unsigned SlotSize = RegInfo->getSlotSize();
- unsigned FramePtr = RegInfo->getFrameRegister(MF);
- const unsigned MachineFramePtr = STI.isTarget64BitILP32() ?
- getX86SubSuperRegister(FramePtr, MVT::i64, false) : FramePtr;
- unsigned StackPtr = RegInfo->getStackRegister();
- unsigned BasePtr = RegInfo->getBaseRegister();
- DebugLoc DL;
-
- // If we're forcing a stack realignment we can't rely on just the frame
- // info, we need to know the ABI stack alignment as well in case we
- // have a call out. Otherwise just make sure we have some alignment - we'll
- // go with the minimum SlotSize.
- if (ForceStackAlign) {
- if (MFI->hasCalls())
- MaxAlign = (StackAlign > MaxAlign) ? StackAlign : MaxAlign;
- else if (MaxAlign < SlotSize)
- MaxAlign = SlotSize;
- }
-
- // Add RETADDR move area to callee saved frame size.
- int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
- if (TailCallReturnAddrDelta < 0)
- X86FI->setCalleeSavedFrameSize(
- X86FI->getCalleeSavedFrameSize() - TailCallReturnAddrDelta);
-
- bool UseStackProbe = (STI.isOSWindows() && !STI.isTargetMachO());
-
- // The default stack probe size is 4096 if the function has no stackprobesize
- // attribute.
- unsigned StackProbeSize = 4096;
- if (Fn->hasFnAttribute("stack-probe-size"))
- Fn->getFnAttribute("stack-probe-size")
- .getValueAsString()
- .getAsInteger(0, StackProbeSize);
-
- // If this is x86-64 and the Red Zone is not disabled, if we are a leaf
- // function, and use up to 128 bytes of stack space, don't have a frame
- // pointer, calls, or dynamic alloca then we do not need to adjust the
- // stack pointer (we fit in the Red Zone). We also check that we don't
- // push and pop from the stack.
- if (Is64Bit && !Fn->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
- Attribute::NoRedZone) &&
- !RegInfo->needsStackRealignment(MF) &&
- !MFI->hasVarSizedObjects() && // No dynamic alloca.
- !MFI->adjustsStack() && // No calls.
- !IsWin64 && // Win64 has no Red Zone
- !usesTheStack(MF) && // Don't push and pop.
- !MF.shouldSplitStack()) { // Regular stack
- uint64_t MinSize = X86FI->getCalleeSavedFrameSize();
- if (HasFP) MinSize += SlotSize;
- StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0);
- MFI->setStackSize(StackSize);
- }
-
- // Insert stack pointer adjustment for later moving of return addr. Only
- // applies to tail call optimized functions where the callee argument stack
- // size is bigger than the callers.
- if (TailCallReturnAddrDelta < 0) {
- MachineInstr *MI =
- BuildMI(MBB, MBBI, DL,
- TII.get(getSUBriOpcode(Uses64BitFramePtr, -TailCallReturnAddrDelta)),
- StackPtr)
- .addReg(StackPtr)
- .addImm(-TailCallReturnAddrDelta)
- .setMIFlag(MachineInstr::FrameSetup);
- MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
- }
-
- // Mapping for machine moves:
- //
- // DST: VirtualFP AND
- // SRC: VirtualFP => DW_CFA_def_cfa_offset
- // ELSE => DW_CFA_def_cfa
- //
- // SRC: VirtualFP AND
- // DST: Register => DW_CFA_def_cfa_register
- //
- // ELSE
- // OFFSET < 0 => DW_CFA_offset_extended_sf
- // REG < 64 => DW_CFA_offset + Reg
- // ELSE => DW_CFA_offset_extended
-
- uint64_t NumBytes = 0;
- int stackGrowth = -SlotSize;
-
- if (HasFP) {
- // Calculate required stack adjustment.
- uint64_t FrameSize = StackSize - SlotSize;
- // If required, include space for extra hidden slot for stashing base pointer.
- if (X86FI->getRestoreBasePointer())
- FrameSize += SlotSize;
- if (RegInfo->needsStackRealignment(MF)) {
- // Callee-saved registers are pushed on stack before the stack
- // is realigned.
- FrameSize -= X86FI->getCalleeSavedFrameSize();
- NumBytes = (FrameSize + MaxAlign - 1) / MaxAlign * MaxAlign;
- } else {
- NumBytes = FrameSize - X86FI->getCalleeSavedFrameSize();
- }
-
- // Get the offset of the stack slot for the EBP register, which is
- // guaranteed to be the last slot by processFunctionBeforeFrameFinalized.
- // Update the frame offset adjustment.
- MFI->setOffsetAdjustment(-NumBytes);
-
- // Save EBP/RBP into the appropriate stack slot.
- BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r))
- .addReg(MachineFramePtr, RegState::Kill)
- .setMIFlag(MachineInstr::FrameSetup);
-
- if (NeedsDwarfCFI) {
- // Mark the place where EBP/RBP was saved.
- // Define the current CFA rule to use the provided offset.
- assert(StackSize);
- unsigned CFIIndex = MMI.addFrameInst(
- MCCFIInstruction::createDefCfaOffset(nullptr, 2 * stackGrowth));
- BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
- .addCFIIndex(CFIIndex);
-
- // Change the rule for the FramePtr to be an "offset" rule.
- unsigned DwarfFramePtr = RegInfo->getDwarfRegNum(MachineFramePtr, true);
- CFIIndex = MMI.addFrameInst(
- MCCFIInstruction::createOffset(nullptr,
- DwarfFramePtr, 2 * stackGrowth));
- BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
- .addCFIIndex(CFIIndex);
- }
-
- if (NeedsWinEH) {
- BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg))
- .addImm(FramePtr)
- .setMIFlag(MachineInstr::FrameSetup);
- }
-
- // Update EBP with the new base value.
- BuildMI(MBB, MBBI, DL,
- TII.get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr), FramePtr)
- .addReg(StackPtr)
- .setMIFlag(MachineInstr::FrameSetup);
-
- if (NeedsDwarfCFI) {
- // Mark effective beginning of when frame pointer becomes valid.
- // Define the current CFA to use the EBP/RBP register.
- unsigned DwarfFramePtr = RegInfo->getDwarfRegNum(MachineFramePtr, true);
- unsigned CFIIndex = MMI.addFrameInst(
- MCCFIInstruction::createDefCfaRegister(nullptr, DwarfFramePtr));
- BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
- .addCFIIndex(CFIIndex);
- }
-
- // Mark the FramePtr as live-in in every block.
- for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I)
- I->addLiveIn(MachineFramePtr);
- } else {
- NumBytes = StackSize - X86FI->getCalleeSavedFrameSize();
- }
-
- // Skip the callee-saved push instructions.
- bool PushedRegs = false;
- int StackOffset = 2 * stackGrowth;
-
- while (MBBI != MBB.end() &&
- (MBBI->getOpcode() == X86::PUSH32r ||
- MBBI->getOpcode() == X86::PUSH64r)) {
- PushedRegs = true;
- unsigned Reg = MBBI->getOperand(0).getReg();
- ++MBBI;
-
- if (!HasFP && NeedsDwarfCFI) {
- // Mark callee-saved push instruction.
- // Define the current CFA rule to use the provided offset.
- assert(StackSize);
- unsigned CFIIndex = MMI.addFrameInst(
- MCCFIInstruction::createDefCfaOffset(nullptr, StackOffset));
- BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
- .addCFIIndex(CFIIndex);
- StackOffset += stackGrowth;
- }
-
- if (NeedsWinEH) {
- BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg)).addImm(Reg).setMIFlag(
- MachineInstr::FrameSetup);
- }
- }
-
- // Realign stack after we pushed callee-saved registers (so that we'll be
- // able to calculate their offsets from the frame pointer).
- if (RegInfo->needsStackRealignment(MF)) {
- assert(HasFP && "There should be a frame pointer if stack is realigned.");
- uint64_t Val = -MaxAlign;
- MachineInstr *MI =
- BuildMI(MBB, MBBI, DL,
- TII.get(getANDriOpcode(Uses64BitFramePtr, Val)), StackPtr)
- .addReg(StackPtr)
- .addImm(Val)
- .setMIFlag(MachineInstr::FrameSetup);
-
- // The EFLAGS implicit def is dead.
- MI->getOperand(3).setIsDead();
- }
-
- // If there is a SUB32ri of ESP immediately before this instruction, merge
- // the two. This can be the case when tail call elimination is enabled and
- // the callee has more arguments than the caller.
- NumBytes -= mergeSPUpdates(MBB, MBBI, StackPtr, true);
-
- // If there is an ADD32ri or SUB32ri of ESP immediately after this
- // instruction, merge the two instructions.
- mergeSPUpdatesDown(MBB, MBBI, StackPtr, &NumBytes);
-
- // Adjust stack pointer: ESP -= numbytes.
-
- // Windows and cygwin/mingw require a prologue helper routine when allocating
- // more than 4K bytes on the stack. Windows uses __chkstk and cygwin/mingw
- // uses __alloca. __alloca and the 32-bit version of __chkstk will probe the
- // stack and adjust the stack pointer in one go. The 64-bit version of
- // __chkstk is only responsible for probing the stack. The 64-bit prologue is
- // responsible for adjusting the stack pointer. Touching the stack at 4K
- // increments is necessary to ensure that the guard pages used by the OS
- // virtual memory manager are allocated in correct sequence.
- if (NumBytes >= StackProbeSize && UseStackProbe) {
- // Check whether EAX is livein for this function.
- bool isEAXAlive = isEAXLiveIn(MF);
-
- if (isEAXAlive) {
- // Sanity check that EAX is not livein for this function.
- // It should not be, so throw an assert.
- assert(!Is64Bit && "EAX is livein in x64 case!");
-
- // Save EAX
- BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH32r))
- .addReg(X86::EAX, RegState::Kill)
- .setMIFlag(MachineInstr::FrameSetup);
- }
-
- if (Is64Bit) {
- // Handle the 64-bit Windows ABI case where we need to call __chkstk.
- // Function prologue is responsible for adjusting the stack pointer.
- BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::RAX)
- .addImm(NumBytes)
- .setMIFlag(MachineInstr::FrameSetup);
- } else {
- // Allocate NumBytes-4 bytes on stack in case of isEAXAlive.
- // We'll also use 4 already allocated bytes for EAX.
- BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX)
- .addImm(isEAXAlive ? NumBytes - 4 : NumBytes)
- .setMIFlag(MachineInstr::FrameSetup);
- }
-
- // Save a pointer to the MI where we set AX.
- MachineBasicBlock::iterator SetRAX = MBBI;
- --SetRAX;
-
- // Call __chkstk, __chkstk_ms, or __alloca.
- emitStackProbeCall(MF, MBB, MBBI, DL);
-
- // Apply the frame setup flag to all inserted instrs.
- for (; SetRAX != MBBI; ++SetRAX)
- SetRAX->setFlag(MachineInstr::FrameSetup);
-
- if (isEAXAlive) {
- // Restore EAX
- MachineInstr *MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm),
- X86::EAX),
- StackPtr, false, NumBytes - 4);
- MI->setFlag(MachineInstr::FrameSetup);
- MBB.insert(MBBI, MI);
- }
- } else if (NumBytes) {
- emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, Uses64BitFramePtr,
- UseLEA, TII, *RegInfo);
- }
-
- int SEHFrameOffset = 0;
- if (NeedsWinEH) {
- if (HasFP) {
- // We need to set frame base offset low enough such that all saved
- // register offsets would be positive relative to it, but we can't
- // just use NumBytes, because .seh_setframe offset must be <=240.
- // So we pretend to have only allocated enough space to spill the
- // non-volatile registers.
- // We don't care about the rest of stack allocation, because unwinder
- // will restore SP to (BP - SEHFrameOffset)
- for (const CalleeSavedInfo &Info : MFI->getCalleeSavedInfo()) {
- int offset = MFI->getObjectOffset(Info.getFrameIdx());
- SEHFrameOffset = std::max(SEHFrameOffset, std::abs(offset));
- }
- SEHFrameOffset += SEHFrameOffset % 16; // ensure alignment
-
- // This only needs to account for XMM spill slots, GPR slots
- // are covered by the .seh_pushreg's emitted above.
- unsigned Size = SEHFrameOffset - X86FI->getCalleeSavedFrameSize();
- if (Size) {
- BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_StackAlloc))
- .addImm(Size)
- .setMIFlag(MachineInstr::FrameSetup);
- }
-
- BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SetFrame))
- .addImm(FramePtr)
- .addImm(SEHFrameOffset)
- .setMIFlag(MachineInstr::FrameSetup);
- } else {
- // SP will be the base register for restoring XMMs
- if (NumBytes) {
- BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_StackAlloc))
- .addImm(NumBytes)
- .setMIFlag(MachineInstr::FrameSetup);
- }
- }
- }
-
- // Skip the rest of register spilling code
- while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup))
- ++MBBI;
-
- // Emit SEH info for non-GPRs
- if (NeedsWinEH) {
- for (const CalleeSavedInfo &Info : MFI->getCalleeSavedInfo()) {
- unsigned Reg = Info.getReg();
- if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg))
- continue;
- assert(X86::FR64RegClass.contains(Reg) && "Unexpected register class");
-
- int Offset = getFrameIndexOffset(MF, Info.getFrameIdx());
- Offset += SEHFrameOffset;
-
- BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SaveXMM))
- .addImm(Reg)
- .addImm(Offset)
- .setMIFlag(MachineInstr::FrameSetup);
- }
-
- BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_EndPrologue))
- .setMIFlag(MachineInstr::FrameSetup);
- }
-
- // If we need a base pointer, set it up here. It's whatever the value
- // of the stack pointer is at this point. Any variable size objects
- // will be allocated after this, so we can still use the base pointer
- // to reference locals.
- if (RegInfo->hasBasePointer(MF)) {
- // Update the base pointer with the current stack pointer.
- unsigned Opc = Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr;
- BuildMI(MBB, MBBI, DL, TII.get(Opc), BasePtr)
- .addReg(StackPtr)
- .setMIFlag(MachineInstr::FrameSetup);
- if (X86FI->getRestoreBasePointer()) {
- // Stash value of base pointer. Saving RSP instead of EBP shortens dependence chain.
- unsigned Opm = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr;
- addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)),
- FramePtr, true, X86FI->getRestoreBasePointerOffset())
- .addReg(StackPtr)
- .setMIFlag(MachineInstr::FrameSetup);
- }
- }
-
- if (((!HasFP && NumBytes) || PushedRegs) && NeedsDwarfCFI) {
- // Mark end of stack pointer adjustment.
- if (!HasFP && NumBytes) {
- // Define the current CFA rule to use the provided offset.
- assert(StackSize);
- unsigned CFIIndex = MMI.addFrameInst(
- MCCFIInstruction::createDefCfaOffset(nullptr,
- -StackSize + stackGrowth));
-
- BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
- .addCFIIndex(CFIIndex);
- }
-
- // Emit DWARF info specifying the offsets of the callee-saved registers.
- if (PushedRegs)
- emitCalleeSavedFrameMoves(MBB, MBBI, DL);
- }
-}
-
-void X86FrameLowering::emitEpilogue(MachineFunction &MF,
- MachineBasicBlock &MBB) const {
- const MachineFrameInfo *MFI = MF.getFrameInfo();
- X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
- const X86RegisterInfo *RegInfo =
- static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo());
- const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
- MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
- assert(MBBI != MBB.end() && "Returning block has no instructions");
- unsigned RetOpcode = MBBI->getOpcode();
- DebugLoc DL = MBBI->getDebugLoc();
- const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
- bool Is64Bit = STI.is64Bit();
- // standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit.
- const bool Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64();
- const bool Is64BitILP32 = STI.isTarget64BitILP32();
- bool UseLEA = STI.useLeaForSP();
- unsigned StackAlign = getStackAlignment();
- unsigned SlotSize = RegInfo->getSlotSize();
- unsigned FramePtr = RegInfo->getFrameRegister(MF);
- unsigned MachineFramePtr = Is64BitILP32 ?
- getX86SubSuperRegister(FramePtr, MVT::i64, false) : FramePtr;
- unsigned StackPtr = RegInfo->getStackRegister();
-
- bool IsWinEH = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
- bool NeedsWinEH = IsWinEH && MF.getFunction()->needsUnwindTableEntry();
-
- switch (RetOpcode) {
- default:
- llvm_unreachable("Can only insert epilog into returning blocks");
- case X86::RETQ:
- case X86::RETL:
- case X86::RETIL:
- case X86::RETIQ:
- case X86::TCRETURNdi:
- case X86::TCRETURNri:
- case X86::TCRETURNmi:
- case X86::TCRETURNdi64:
- case X86::TCRETURNri64:
- case X86::TCRETURNmi64:
- case X86::EH_RETURN:
- case X86::EH_RETURN64:
- break; // These are ok
- }
-
- // Get the number of bytes to allocate from the FrameInfo.
- uint64_t StackSize = MFI->getStackSize();
- uint64_t MaxAlign = MFI->getMaxAlignment();
- unsigned CSSize = X86FI->getCalleeSavedFrameSize();
- uint64_t NumBytes = 0;
-
- // If we're forcing a stack realignment we can't rely on just the frame
- // info, we need to know the ABI stack alignment as well in case we
- // have a call out. Otherwise just make sure we have some alignment - we'll
- // go with the minimum.
- if (ForceStackAlign) {
- if (MFI->hasCalls())
- MaxAlign = (StackAlign > MaxAlign) ? StackAlign : MaxAlign;
- else
- MaxAlign = MaxAlign ? MaxAlign : 4;
- }
-
- if (hasFP(MF)) {
- // Calculate required stack adjustment.
- uint64_t FrameSize = StackSize - SlotSize;
- if (RegInfo->needsStackRealignment(MF)) {
- // Callee-saved registers were pushed on stack before the stack
- // was realigned.
- FrameSize -= CSSize;
- NumBytes = (FrameSize + MaxAlign - 1) / MaxAlign * MaxAlign;
- } else {
- NumBytes = FrameSize - CSSize;
- }
-
- // Pop EBP.
- BuildMI(MBB, MBBI, DL,
- TII.get(Is64Bit ? X86::POP64r : X86::POP32r), MachineFramePtr);
- } else {
- NumBytes = StackSize - CSSize;
- }
-
- // Skip the callee-saved pop instructions.
- while (MBBI != MBB.begin()) {
- MachineBasicBlock::iterator PI = std::prev(MBBI);
- unsigned Opc = PI->getOpcode();
-
- if (Opc != X86::POP32r && Opc != X86::POP64r && Opc != X86::DBG_VALUE &&
- !PI->isTerminator())
- break;
-
- --MBBI;
- }
- MachineBasicBlock::iterator FirstCSPop = MBBI;
-
- DL = MBBI->getDebugLoc();
-
- // If there is an ADD32ri or SUB32ri of ESP immediately before this
- // instruction, merge the two instructions.
- if (NumBytes || MFI->hasVarSizedObjects())
- mergeSPUpdatesUp(MBB, MBBI, StackPtr, &NumBytes);
-
- // If dynamic alloca is used, then reset esp to point to the last callee-saved
- // slot before popping them off! The same applies when the stack was
- // realigned.
- if (RegInfo->needsStackRealignment(MF) || MFI->hasVarSizedObjects()) {
- if (RegInfo->needsStackRealignment(MF))
- MBBI = FirstCSPop;
- if (CSSize != 0) {
- unsigned Opc = getLEArOpcode(Uses64BitFramePtr);
- addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr),
- FramePtr, false, -CSSize);
- --MBBI;
- } else {
- unsigned Opc = (Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr);
- BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
- .addReg(FramePtr);
- --MBBI;
- }
- } else if (NumBytes) {
- // Adjust stack pointer back: ESP += numbytes.
- emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, Uses64BitFramePtr, UseLEA,
- TII, *RegInfo);
- --MBBI;
- }
-
- // Windows unwinder will not invoke function's exception handler if IP is
- // either in prologue or in epilogue. This behavior causes a problem when a
- // call immediately precedes an epilogue, because the return address points
- // into the epilogue. To cope with that, we insert an epilogue marker here,
- // then replace it with a 'nop' if it ends up immediately after a CALL in the
- // final emitted code.
- if (NeedsWinEH)
- BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_Epilogue));
-
- // We're returning from function via eh_return.
- if (RetOpcode == X86::EH_RETURN || RetOpcode == X86::EH_RETURN64) {
- MBBI = MBB.getLastNonDebugInstr();
- MachineOperand &DestAddr = MBBI->getOperand(0);
- assert(DestAddr.isReg() && "Offset should be in register!");
- BuildMI(MBB, MBBI, DL,
- TII.get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr),
- StackPtr).addReg(DestAddr.getReg());
- } else if (RetOpcode == X86::TCRETURNri || RetOpcode == X86::TCRETURNdi ||
- RetOpcode == X86::TCRETURNmi ||
- RetOpcode == X86::TCRETURNri64 || RetOpcode == X86::TCRETURNdi64 ||
- RetOpcode == X86::TCRETURNmi64) {
- bool isMem = RetOpcode == X86::TCRETURNmi || RetOpcode == X86::TCRETURNmi64;
- // Tail call return: adjust the stack pointer and jump to callee.
- MBBI = MBB.getLastNonDebugInstr();
- MachineOperand &JumpTarget = MBBI->getOperand(0);
- MachineOperand &StackAdjust = MBBI->getOperand(isMem ? 5 : 1);
- assert(StackAdjust.isImm() && "Expecting immediate value.");
-
- // Adjust stack pointer.
- int StackAdj = StackAdjust.getImm();
- int MaxTCDelta = X86FI->getTCReturnAddrDelta();
- int Offset = 0;
- assert(MaxTCDelta <= 0 && "MaxTCDelta should never be positive");
-
- // Incorporate the retaddr area.
- Offset = StackAdj-MaxTCDelta;
- assert(Offset >= 0 && "Offset should never be negative");
-
- if (Offset) {
- // Check for possible merge with preceding ADD instruction.
- Offset += mergeSPUpdates(MBB, MBBI, StackPtr, true);
- emitSPUpdate(MBB, MBBI, StackPtr, Offset, Is64Bit, Uses64BitFramePtr,
- UseLEA, TII, *RegInfo);
- }
-
- // Jump to label or value in register.
- bool IsWin64 = STI.isTargetWin64();
- if (RetOpcode == X86::TCRETURNdi || RetOpcode == X86::TCRETURNdi64) {
- unsigned Op = (RetOpcode == X86::TCRETURNdi)
- ? X86::TAILJMPd
- : (IsWin64 ? X86::TAILJMPd64_REX : X86::TAILJMPd64);
- MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII.get(Op));
- if (JumpTarget.isGlobal())
- MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(),
- JumpTarget.getTargetFlags());
- else {
- assert(JumpTarget.isSymbol());
- MIB.addExternalSymbol(JumpTarget.getSymbolName(),
- JumpTarget.getTargetFlags());
- }
- } else if (RetOpcode == X86::TCRETURNmi || RetOpcode == X86::TCRETURNmi64) {
- unsigned Op = (RetOpcode == X86::TCRETURNmi)
- ? X86::TAILJMPm
- : (IsWin64 ? X86::TAILJMPm64_REX : X86::TAILJMPm64);
- MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII.get(Op));
- for (unsigned i = 0; i != 5; ++i)
- MIB.addOperand(MBBI->getOperand(i));
- } else if (RetOpcode == X86::TCRETURNri64) {
- BuildMI(MBB, MBBI, DL,
- TII.get(IsWin64 ? X86::TAILJMPr64_REX : X86::TAILJMPr64))
- .addReg(JumpTarget.getReg(), RegState::Kill);
- } else {
- BuildMI(MBB, MBBI, DL, TII.get(X86::TAILJMPr)).
- addReg(JumpTarget.getReg(), RegState::Kill);
- }
-
- MachineInstr *NewMI = std::prev(MBBI);
- NewMI->copyImplicitOps(MF, MBBI);
-
- // Delete the pseudo instruction TCRETURN.
- MBB.erase(MBBI);
- } else if ((RetOpcode == X86::RETQ || RetOpcode == X86::RETL ||
- RetOpcode == X86::RETIQ || RetOpcode == X86::RETIL) &&
- (X86FI->getTCReturnAddrDelta() < 0)) {
- // Add the return addr area delta back since we are not tail calling.
- int delta = -1*X86FI->getTCReturnAddrDelta();
- MBBI = MBB.getLastNonDebugInstr();
-
- // Check for possible merge with preceding ADD instruction.
- delta += mergeSPUpdates(MBB, MBBI, StackPtr, true);
- emitSPUpdate(MBB, MBBI, StackPtr, delta, Is64Bit, Uses64BitFramePtr, UseLEA, TII,
- *RegInfo);
- }
-}
-
-int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF,
- int FI) const {
- const X86RegisterInfo *RegInfo =
- static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo());
- const MachineFrameInfo *MFI = MF.getFrameInfo();
- int Offset = MFI->getObjectOffset(FI) - getOffsetOfLocalArea();
- uint64_t StackSize = MFI->getStackSize();
-
- if (RegInfo->hasBasePointer(MF)) {
- assert (hasFP(MF) && "VLAs and dynamic stack realign, but no FP?!");
- if (FI < 0) {
- // Skip the saved EBP.
- return Offset + RegInfo->getSlotSize();
- } else {
- assert((-(Offset + StackSize)) % MFI->getObjectAlignment(FI) == 0);
- return Offset + StackSize;
- }
- } else if (RegInfo->needsStackRealignment(MF)) {
- if (FI < 0) {
- // Skip the saved EBP.
- return Offset + RegInfo->getSlotSize();
- } else {
- assert((-(Offset + StackSize)) % MFI->getObjectAlignment(FI) == 0);
- return Offset + StackSize;
- }
- // FIXME: Support tail calls
- } else {
- if (!hasFP(MF))
- return Offset + StackSize;
-
- // Skip the saved EBP.
- Offset += RegInfo->getSlotSize();
-
- // Skip the RETADDR move area
- const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
- int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
- if (TailCallReturnAddrDelta < 0)
- Offset -= TailCallReturnAddrDelta;
- }
-
- return Offset;
-}
-
-int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
- unsigned &FrameReg) const {
- const X86RegisterInfo *RegInfo =
- static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo());
- // We can't calculate offset from frame pointer if the stack is realigned,
- // so enforce usage of stack/base pointer. The base pointer is used when we
- // have dynamic allocas in addition to dynamic realignment.
- if (RegInfo->hasBasePointer(MF))
- FrameReg = RegInfo->getBaseRegister();
- else if (RegInfo->needsStackRealignment(MF))
- FrameReg = RegInfo->getStackRegister();
- else
- FrameReg = RegInfo->getFrameRegister(MF);
- return getFrameIndexOffset(MF, FI);
-}
-
-// Simplified from getFrameIndexOffset keeping only StackPointer cases
-int X86FrameLowering::getFrameIndexOffsetFromSP(const MachineFunction &MF, int FI) const {
- const MachineFrameInfo *MFI = MF.getFrameInfo();
- // Does not include any dynamic realign.
- const uint64_t StackSize = MFI->getStackSize();
- {
-#ifndef NDEBUG
- const X86RegisterInfo *RegInfo =
- static_cast<const X86RegisterInfo*>(MF.getSubtarget().getRegisterInfo());
- // Note: LLVM arranges the stack as:
- // Args > Saved RetPC (<--FP) > CSRs > dynamic alignment (<--BP)
- // > "Stack Slots" (<--SP)
- // We can always address StackSlots from RSP. We can usually (unless
- // needsStackRealignment) address CSRs from RSP, but sometimes need to
- // address them from RBP. FixedObjects can be placed anywhere in the stack
- // frame depending on their specific requirements (i.e. we can actually
- // refer to arguments to the function which are stored in the *callers*
- // frame). As a result, THE RESULT OF THIS CALL IS MEANINGLESS FOR CSRs
- // AND FixedObjects IFF needsStackRealignment or hasVarSizedObject.
-
- assert(!RegInfo->hasBasePointer(MF) && "we don't handle this case");
-
- // We don't handle tail calls, and shouldn't be seeing them
- // either.
- int TailCallReturnAddrDelta =
- MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta();
- assert(!(TailCallReturnAddrDelta < 0) && "we don't handle this case!");
-#endif
- }
-
- // This is how the math works out:
- //
- // %rsp grows (i.e. gets lower) left to right. Each box below is
- // one word (eight bytes). Obj0 is the stack slot we're trying to
- // get to.
- //
- // ----------------------------------
- // | BP | Obj0 | Obj1 | ... | ObjN |
- // ----------------------------------
- // ^ ^ ^ ^
- // A B C E
- //
- // A is the incoming stack pointer.
- // (B - A) is the local area offset (-8 for x86-64) [1]
- // (C - A) is the Offset returned by MFI->getObjectOffset for Obj0 [2]
- //
- // |(E - B)| is the StackSize (absolute value, positive). For a
- // stack that grows down, this works out to be (B - E). [3]
- //
- // E is also the value of %rsp after stack has been set up, and we
- // want (C - E) -- the value we can add to %rsp to get to Obj0. Now
- // (C - E) == (C - A) - (B - A) + (B - E)
- // { Using [1], [2] and [3] above }
- // == getObjectOffset - LocalAreaOffset + StackSize
- //
-
- // Get the Offset from the StackPointer
- int Offset = MFI->getObjectOffset(FI) - getOffsetOfLocalArea();
-
- return Offset + StackSize;
-}
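To make the derivation above concrete, here is a small standalone check with illustrative numbers (the -16 object offset and 40-byte frame are assumptions, not values produced by this function; -8 matches the x86-64 local area offset mentioned in the comment):

#include <cassert>

int main() {
  // (C - A): MFI->getObjectOffset(FI) for Obj0 in the picture above.
  int ObjectOffset = -16;
  // (B - A): the local area offset, -8 on x86-64.
  int LocalAreaOffset = -8;
  // |E - B|: the frame size established by the prologue.
  int StackSize = 40;

  // (C - E) == (C - A) - (B - A) + (B - E)
  int OffsetFromSP = ObjectOffset - LocalAreaOffset + StackSize;
  assert(OffsetFromSP == 32); // %rsp + 32 reaches Obj0
  return 0;
}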
-// Simplified from getFrameIndexReference keeping only StackPointer cases
-int X86FrameLowering::getFrameIndexReferenceFromSP(const MachineFunction &MF, int FI,
- unsigned &FrameReg) const {
- const X86RegisterInfo *RegInfo =
- static_cast<const X86RegisterInfo*>(MF.getSubtarget().getRegisterInfo());
-
- assert(!RegInfo->hasBasePointer(MF) && "we don't handle this case");
-
- FrameReg = RegInfo->getStackRegister();
- return getFrameIndexOffsetFromSP(MF, FI);
-}
-
-bool X86FrameLowering::assignCalleeSavedSpillSlots(
- MachineFunction &MF, const TargetRegisterInfo *TRI,
- std::vector<CalleeSavedInfo> &CSI) const {
- MachineFrameInfo *MFI = MF.getFrameInfo();
- const X86RegisterInfo *RegInfo =
- static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo());
- unsigned SlotSize = RegInfo->getSlotSize();
- X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
-
- unsigned CalleeSavedFrameSize = 0;
- int SpillSlotOffset = getOffsetOfLocalArea() + X86FI->getTCReturnAddrDelta();
-
- if (hasFP(MF)) {
- // emitPrologue always spills the frame register first.
- SpillSlotOffset -= SlotSize;
- MFI->CreateFixedSpillStackObject(SlotSize, SpillSlotOffset);
-
- // Since emitPrologue and emitEpilogue will handle spilling and restoring of
- // the frame register, we can delete it from CSI list and not have to worry
- // about avoiding it later.
- unsigned FPReg = RegInfo->getFrameRegister(MF);
- for (unsigned i = 0; i < CSI.size(); ++i) {
- if (TRI->regsOverlap(CSI[i].getReg(),FPReg)) {
- CSI.erase(CSI.begin() + i);
- break;
- }
- }
- }
-
- // Assign slots for GPRs. It increases frame size.
- for (unsigned i = CSI.size(); i != 0; --i) {
- unsigned Reg = CSI[i - 1].getReg();
-
- if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg))
- continue;
-
- SpillSlotOffset -= SlotSize;
- CalleeSavedFrameSize += SlotSize;
-
- int SlotIndex = MFI->CreateFixedSpillStackObject(SlotSize, SpillSlotOffset);
- CSI[i - 1].setFrameIdx(SlotIndex);
- }
-
- X86FI->setCalleeSavedFrameSize(CalleeSavedFrameSize);
-
- // Assign slots for XMMs.
- for (unsigned i = CSI.size(); i != 0; --i) {
- unsigned Reg = CSI[i - 1].getReg();
- if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg))
- continue;
-
- const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg);
- // ensure alignment
- SpillSlotOffset -= std::abs(SpillSlotOffset) % RC->getAlignment();
- // spill into slot
- SpillSlotOffset -= RC->getSize();
- int SlotIndex =
- MFI->CreateFixedSpillStackObject(RC->getSize(), SpillSlotOffset);
- CSI[i - 1].setFrameIdx(SlotIndex);
- MFI->ensureMaxAlignment(RC->getAlignment());
- }
-
- return true;
-}
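A minimal sketch of the slot-assignment bookkeeping above, tracking only the descending offset; the two GPRs, the 16-byte XMM size/alignment, and the starting offset are assumed example values:

#include <cstdio>
#include <cstdlib>

int main() {
  int SpillSlotOffset = -8;       // say, just below the saved frame pointer
  const int SlotSize = 8;

  // Two GPR slots: each simply moves the offset down by one slot.
  for (int i = 0; i < 2; ++i)
    SpillSlotOffset -= SlotSize;  // -16, then -24

  // One 16-byte XMM slot: first round down to the required alignment,
  // then make room for the register itself.
  const int XMMSize = 16, XMMAlign = 16;
  SpillSlotOffset -= std::abs(SpillSlotOffset) % XMMAlign; // -24 -> -32
  SpillSlotOffset -= XMMSize;                              // -32 -> -48

  std::printf("next free offset: %d\n", SpillSlotOffset);  // prints -48
}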
-
-bool X86FrameLowering::spillCalleeSavedRegisters(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const {
- DebugLoc DL = MBB.findDebugLoc(MI);
-
- MachineFunction &MF = *MBB.getParent();
- const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
- const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
-
- // Push GPRs. It increases frame size.
- unsigned Opc = STI.is64Bit() ? X86::PUSH64r : X86::PUSH32r;
- for (unsigned i = CSI.size(); i != 0; --i) {
- unsigned Reg = CSI[i - 1].getReg();
-
- if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg))
- continue;
- // Add the callee-saved register as live-in. It's killed at the spill.
- MBB.addLiveIn(Reg);
-
- BuildMI(MBB, MI, DL, TII.get(Opc)).addReg(Reg, RegState::Kill)
- .setMIFlag(MachineInstr::FrameSetup);
- }
-
- // Spill XMM regs. X86 does not have the ability to push/pop XMM
- // registers, so they are stored to the stack frame instead.
- for (unsigned i = CSI.size(); i != 0; --i) {
- unsigned Reg = CSI[i-1].getReg();
- if (X86::GR64RegClass.contains(Reg) ||
- X86::GR32RegClass.contains(Reg))
- continue;
- // Add the callee-saved register as live-in. It's killed at the spill.
- MBB.addLiveIn(Reg);
- const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
-
- TII.storeRegToStackSlot(MBB, MI, Reg, true, CSI[i - 1].getFrameIdx(), RC,
- TRI);
- --MI;
- MI->setFlag(MachineInstr::FrameSetup);
- ++MI;
- }
-
- return true;
-}
-
-bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const {
- if (CSI.empty())
- return false;
-
- DebugLoc DL = MBB.findDebugLoc(MI);
-
- MachineFunction &MF = *MBB.getParent();
- const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
- const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
-
- // Reload XMMs from stack frame.
- for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
- unsigned Reg = CSI[i].getReg();
- if (X86::GR64RegClass.contains(Reg) ||
- X86::GR32RegClass.contains(Reg))
- continue;
-
- const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
- TII.loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(), RC, TRI);
- }
-
- // POP GPRs.
- unsigned Opc = STI.is64Bit() ? X86::POP64r : X86::POP32r;
- for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
- unsigned Reg = CSI[i].getReg();
- if (!X86::GR64RegClass.contains(Reg) &&
- !X86::GR32RegClass.contains(Reg))
- continue;
-
- BuildMI(MBB, MI, DL, TII.get(Opc), Reg);
- }
- return true;
-}
-
-void
-X86FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
- RegScavenger *RS) const {
- MachineFrameInfo *MFI = MF.getFrameInfo();
- const X86RegisterInfo *RegInfo =
- static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo());
- unsigned SlotSize = RegInfo->getSlotSize();
-
- X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
- int64_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
-
- if (TailCallReturnAddrDelta < 0) {
- // create RETURNADDR area
- // arg
- // arg
- // RETADDR
- // { ...
- // RETADDR area
- // ...
- // }
- // [EBP]
- MFI->CreateFixedObject(-TailCallReturnAddrDelta,
- TailCallReturnAddrDelta - SlotSize, true);
- }
-
- // Spill the BasePtr if it's used.
- if (RegInfo->hasBasePointer(MF))
- MF.getRegInfo().setPhysRegUsed(RegInfo->getBaseRegister());
-}
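For the RETURNADDR-area case above, a hedged numeric illustration of the two CreateFixedObject arguments (the -16 delta and 8-byte slot are assumptions for the example):

#include <cstdio>

int main() {
  // TailCallReturnAddrDelta is negative when the callee needs more argument
  // stack than the caller provided; e.g. two extra 8-byte slots.
  int TailCallReturnAddrDelta = -16;
  int SlotSize = 8;

  int ObjectSize = -TailCallReturnAddrDelta;             // 16 bytes
  int FixedOffset = TailCallReturnAddrDelta - SlotSize;  // -24, below RETADDR

  std::printf("size=%d offset=%d\n", ObjectSize, FixedOffset);
}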
-
-static bool
-HasNestArgument(const MachineFunction *MF) {
- const Function *F = MF->getFunction();
- for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
- I != E; I++) {
- if (I->hasNestAttr())
- return true;
- }
- return false;
-}
-
-/// GetScratchRegister - Get a temp register for performing work in the
-/// segmented stack and the Erlang/HiPE stack prologue. Depending on platform
-/// and the properties of the function either one or two registers will be
-/// needed. Set primary to true for the first register, false for the second.
-static unsigned
-GetScratchRegister(bool Is64Bit, bool IsLP64, const MachineFunction &MF, bool Primary) {
- CallingConv::ID CallingConvention = MF.getFunction()->getCallingConv();
-
- // Erlang stuff.
- if (CallingConvention == CallingConv::HiPE) {
- if (Is64Bit)
- return Primary ? X86::R14 : X86::R13;
- else
- return Primary ? X86::EBX : X86::EDI;
- }
-
- if (Is64Bit) {
- if (IsLP64)
- return Primary ? X86::R11 : X86::R12;
- else
- return Primary ? X86::R11D : X86::R12D;
- }
-
- bool IsNested = HasNestArgument(&MF);
-
- if (CallingConvention == CallingConv::X86_FastCall ||
- CallingConvention == CallingConv::Fast) {
- if (IsNested)
- report_fatal_error("Segmented stacks does not support fastcall with "
- "nested function.");
- return Primary ? X86::EAX : X86::ECX;
- }
- if (IsNested)
- return Primary ? X86::EDX : X86::EAX;
- return Primary ? X86::ECX : X86::EAX;
-}
-
-// The stack limit in the TCB is set to this many bytes above the actual stack
-// limit.
-static const uint64_t kSplitStackAvailable = 256;
-
-void
-X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const {
- MachineBasicBlock &prologueMBB = MF.front();
- MachineFrameInfo *MFI = MF.getFrameInfo();
- const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
- uint64_t StackSize;
- const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
- bool Is64Bit = STI.is64Bit();
- const bool IsLP64 = STI.isTarget64BitLP64();
- unsigned TlsReg, TlsOffset;
- DebugLoc DL;
-
- unsigned ScratchReg = GetScratchRegister(Is64Bit, IsLP64, MF, true);
- assert(!MF.getRegInfo().isLiveIn(ScratchReg) &&
- "Scratch register is live-in");
-
- if (MF.getFunction()->isVarArg())
- report_fatal_error("Segmented stacks do not support vararg functions.");
- if (!STI.isTargetLinux() && !STI.isTargetDarwin() && !STI.isTargetWin32() &&
- !STI.isTargetWin64() && !STI.isTargetFreeBSD() &&
- !STI.isTargetDragonFly())
- report_fatal_error("Segmented stacks not supported on this platform.");
-
- // Eventually StackSize will be calculated by a link-time pass, which will
- // also decide whether checking code needs to be injected into this particular
- // prologue.
- StackSize = MFI->getStackSize();
-
- // Do not generate a prologue for functions with a stack of size zero
- if (StackSize == 0)
- return;
-
- MachineBasicBlock *allocMBB = MF.CreateMachineBasicBlock();
- MachineBasicBlock *checkMBB = MF.CreateMachineBasicBlock();
- X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
- bool IsNested = false;
-
- // We need to know if the function has a nest argument only in 64 bit mode.
- if (Is64Bit)
- IsNested = HasNestArgument(&MF);
-
- // The MOV R10, RAX needs to be in a different block, since the RET we emit in
- // allocMBB needs to be the last (terminating) instruction.
-
- for (MachineBasicBlock::livein_iterator i = prologueMBB.livein_begin(),
- e = prologueMBB.livein_end(); i != e; i++) {
- allocMBB->addLiveIn(*i);
- checkMBB->addLiveIn(*i);
- }
-
- if (IsNested)
- allocMBB->addLiveIn(IsLP64 ? X86::R10 : X86::R10D);
-
- MF.push_front(allocMBB);
- MF.push_front(checkMBB);
-
- // When the frame size is less than 256 we just compare the stack
- // boundary directly to the value of the stack pointer, per gcc.
- bool CompareStackPointer = StackSize < kSplitStackAvailable;
-
- // Read the limit of the current stacklet from the stack_guard location.
- if (Is64Bit) {
- if (STI.isTargetLinux()) {
- TlsReg = X86::FS;
- TlsOffset = IsLP64 ? 0x70 : 0x40;
- } else if (STI.isTargetDarwin()) {
- TlsReg = X86::GS;
- TlsOffset = 0x60 + 90*8; // See pthread_machdep.h. Steal TLS slot 90.
- } else if (STI.isTargetWin64()) {
- TlsReg = X86::GS;
- TlsOffset = 0x28; // pvArbitrary, reserved for application use
- } else if (STI.isTargetFreeBSD()) {
- TlsReg = X86::FS;
- TlsOffset = 0x18;
- } else if (STI.isTargetDragonFly()) {
- TlsReg = X86::FS;
- TlsOffset = 0x20; // use tls_tcb.tcb_segstack
- } else {
- report_fatal_error("Segmented stacks not supported on this platform.");
- }
-
- if (CompareStackPointer)
- ScratchReg = IsLP64 ? X86::RSP : X86::ESP;
- else
- BuildMI(checkMBB, DL, TII.get(IsLP64 ? X86::LEA64r : X86::LEA64_32r), ScratchReg).addReg(X86::RSP)
- .addImm(1).addReg(0).addImm(-StackSize).addReg(0);
-
- BuildMI(checkMBB, DL, TII.get(IsLP64 ? X86::CMP64rm : X86::CMP32rm)).addReg(ScratchReg)
- .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg);
- } else {
- if (STI.isTargetLinux()) {
- TlsReg = X86::GS;
- TlsOffset = 0x30;
- } else if (STI.isTargetDarwin()) {
- TlsReg = X86::GS;
- TlsOffset = 0x48 + 90*4;
- } else if (STI.isTargetWin32()) {
- TlsReg = X86::FS;
- TlsOffset = 0x14; // pvArbitrary, reserved for application use
- } else if (STI.isTargetDragonFly()) {
- TlsReg = X86::FS;
- TlsOffset = 0x10; // use tls_tcb.tcb_segstack
- } else if (STI.isTargetFreeBSD()) {
- report_fatal_error("Segmented stacks not supported on FreeBSD i386.");
- } else {
- report_fatal_error("Segmented stacks not supported on this platform.");
- }
-
- if (CompareStackPointer)
- ScratchReg = X86::ESP;
- else
- BuildMI(checkMBB, DL, TII.get(X86::LEA32r), ScratchReg).addReg(X86::ESP)
- .addImm(1).addReg(0).addImm(-StackSize).addReg(0);
-
- if (STI.isTargetLinux() || STI.isTargetWin32() || STI.isTargetWin64() ||
- STI.isTargetDragonFly()) {
- BuildMI(checkMBB, DL, TII.get(X86::CMP32rm)).addReg(ScratchReg)
- .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg);
- } else if (STI.isTargetDarwin()) {
-
- // TlsOffset doesn't fit into a mod r/m byte so we need an extra register.
- unsigned ScratchReg2;
- bool SaveScratch2;
- if (CompareStackPointer) {
- // The primary scratch register is available for holding the TLS offset.
- ScratchReg2 = GetScratchRegister(Is64Bit, IsLP64, MF, true);
- SaveScratch2 = false;
- } else {
- // Need to use a second register to hold the TLS offset
- ScratchReg2 = GetScratchRegister(Is64Bit, IsLP64, MF, false);
-
- // Unfortunately, with fastcc the second scratch register may hold an
- // argument.
- SaveScratch2 = MF.getRegInfo().isLiveIn(ScratchReg2);
- }
-
- // If Scratch2 is live-in then it needs to be saved.
- assert((!MF.getRegInfo().isLiveIn(ScratchReg2) || SaveScratch2) &&
- "Scratch register is live-in and not saved");
-
- if (SaveScratch2)
- BuildMI(checkMBB, DL, TII.get(X86::PUSH32r))
- .addReg(ScratchReg2, RegState::Kill);
-
- BuildMI(checkMBB, DL, TII.get(X86::MOV32ri), ScratchReg2)
- .addImm(TlsOffset);
- BuildMI(checkMBB, DL, TII.get(X86::CMP32rm))
- .addReg(ScratchReg)
- .addReg(ScratchReg2).addImm(1).addReg(0)
- .addImm(0)
- .addReg(TlsReg);
-
- if (SaveScratch2)
- BuildMI(checkMBB, DL, TII.get(X86::POP32r), ScratchReg2);
- }
- }
-
- // This jump is taken if SP >= (Stacklet Limit + Stack Space required).
- // It jumps to normal execution of the function body.
- BuildMI(checkMBB, DL, TII.get(X86::JA_1)).addMBB(&prologueMBB);
-
- // On 32 bit we first push the arguments size and then the frame size. On 64
- // bit, we pass the stack frame size in r10 and the argument size in r11.
- if (Is64Bit) {
- // Functions with nested arguments use R10, so it needs to be saved across
- // the call to _morestack
-
- const unsigned RegAX = IsLP64 ? X86::RAX : X86::EAX;
- const unsigned Reg10 = IsLP64 ? X86::R10 : X86::R10D;
- const unsigned Reg11 = IsLP64 ? X86::R11 : X86::R11D;
- const unsigned MOVrr = IsLP64 ? X86::MOV64rr : X86::MOV32rr;
- const unsigned MOVri = IsLP64 ? X86::MOV64ri : X86::MOV32ri;
-
- if (IsNested)
- BuildMI(allocMBB, DL, TII.get(MOVrr), RegAX).addReg(Reg10);
-
- BuildMI(allocMBB, DL, TII.get(MOVri), Reg10)
- .addImm(StackSize);
- BuildMI(allocMBB, DL, TII.get(MOVri), Reg11)
- .addImm(X86FI->getArgumentStackSize());
- MF.getRegInfo().setPhysRegUsed(Reg10);
- MF.getRegInfo().setPhysRegUsed(Reg11);
- } else {
- BuildMI(allocMBB, DL, TII.get(X86::PUSHi32))
- .addImm(X86FI->getArgumentStackSize());
- BuildMI(allocMBB, DL, TII.get(X86::PUSHi32))
- .addImm(StackSize);
- }
-
- // __morestack is in libgcc
- if (Is64Bit && MF.getTarget().getCodeModel() == CodeModel::Large) {
- // Under the large code model, we cannot assume that __morestack lives
- // within 2^31 bytes of the call site, so we cannot use pc-relative
- // addressing. We cannot perform the call via a temporary register,
- // as the rax register may be used to store the static chain, and all
- // other suitable registers may be either callee-save or used for
- // parameter passing. We cannot use the stack at this point either
- // because __morestack manipulates the stack directly.
- //
- // To avoid these issues, perform an indirect call via a read-only memory
- // location containing the address.
- //
- // This solution is not perfect, as it assumes that the .rodata section
- // is laid out within 2^31 bytes of each function body, but this seems
- // to be sufficient for JIT.
- BuildMI(allocMBB, DL, TII.get(X86::CALL64m))
- .addReg(X86::RIP)
- .addImm(0)
- .addReg(0)
- .addExternalSymbol("__morestack_addr")
- .addReg(0);
- MF.getMMI().setUsesMorestackAddr(true);
- } else {
- if (Is64Bit)
- BuildMI(allocMBB, DL, TII.get(X86::CALL64pcrel32))
- .addExternalSymbol("__morestack");
- else
- BuildMI(allocMBB, DL, TII.get(X86::CALLpcrel32))
- .addExternalSymbol("__morestack");
- }
-
- if (IsNested)
- BuildMI(allocMBB, DL, TII.get(X86::MORESTACK_RET_RESTORE_R10));
- else
- BuildMI(allocMBB, DL, TII.get(X86::MORESTACK_RET));
-
- allocMBB->addSuccessor(&prologueMBB);
-
- checkMBB->addSuccessor(allocMBB);
- checkMBB->addSuccessor(&prologueMBB);
-
-#ifdef XDEBUG
- MF.verify();
-#endif
-}
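The checkMBB logic above boils down to one comparison against the per-thread stack limit. A simplified, target-agnostic sketch of that decision; the sample stack pointer, frame size, and limit are invented inputs:

#include <cstdint>
#include <cstdio>

// Returns true when the function body fits the current stacklet and the
// __morestack call can be skipped, mirroring the JA branch above.
bool fitsCurrentStacklet(uint64_t SP, uint64_t StackSize, uint64_t TlsLimit) {
  const uint64_t kSplitStackAvailable = 256;
  // Small frames compare SP directly (the stored limit is biased by 256
  // bytes); larger frames compare SP - StackSize, as the LEA computes.
  uint64_t Probe = StackSize < kSplitStackAvailable ? SP : SP - StackSize;
  return Probe > TlsLimit; // JA: unsigned "above"
}

int main() {
  std::printf("%d\n", fitsCurrentStacklet(0x7fff0000u, 4096, 0x7ffe0000u));
}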
-
-/// Erlang programs may need a special prologue to handle the stack size they
-/// might need at runtime. That is because Erlang/OTP does not implement a C
-/// stack but uses a custom implementation of a hybrid stack/heap architecture.
-/// (for more information see Eric Stenman's Ph.D. thesis:
-/// http://publications.uu.se/uu/fulltext/nbn_se_uu_diva-2688.pdf)
-///
-/// CheckStack:
-/// temp0 = sp - MaxStack
-/// if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart
-/// OldStart:
-/// ...
-/// IncStack:
-/// call inc_stack # doubles the stack space
-/// temp0 = sp - MaxStack
-/// if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart
-void X86FrameLowering::adjustForHiPEPrologue(MachineFunction &MF) const {
- const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
- MachineFrameInfo *MFI = MF.getFrameInfo();
- const unsigned SlotSize =
- static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo())
- ->getSlotSize();
- const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
- const bool Is64Bit = STI.is64Bit();
- const bool IsLP64 = STI.isTarget64BitLP64();
- DebugLoc DL;
- // HiPE-specific values
- const unsigned HipeLeafWords = 24;
- const unsigned CCRegisteredArgs = Is64Bit ? 6 : 5;
- const unsigned Guaranteed = HipeLeafWords * SlotSize;
- unsigned CallerStkArity = MF.getFunction()->arg_size() > CCRegisteredArgs ?
- MF.getFunction()->arg_size() - CCRegisteredArgs : 0;
- unsigned MaxStack = MFI->getStackSize() + CallerStkArity*SlotSize + SlotSize;
-
- assert(STI.isTargetLinux() &&
- "HiPE prologue is only supported on Linux operating systems.");
-
- // Compute the largest caller's frame that is needed to fit the callees'
- // frames. This 'MaxStack' is computed from:
- //
- // a) the fixed frame size, which is the space needed for all spilled temps,
- // b) outgoing on-stack parameter areas, and
- // c) the minimum stack space this function needs to make available for the
- // functions it calls (a tunable ABI property).
- if (MFI->hasCalls()) {
- unsigned MoreStackForCalls = 0;
-
- for (MachineFunction::iterator MBBI = MF.begin(), MBBE = MF.end();
- MBBI != MBBE; ++MBBI)
- for (MachineBasicBlock::iterator MI = MBBI->begin(), ME = MBBI->end();
- MI != ME; ++MI) {
- if (!MI->isCall())
- continue;
-
- // Get callee operand.
- const MachineOperand &MO = MI->getOperand(0);
-
- // Only take account of global function calls (no closures etc.).
- if (!MO.isGlobal())
- continue;
-
- const Function *F = dyn_cast<Function>(MO.getGlobal());
- if (!F)
- continue;
-
- // Do not update 'MaxStack' for primitive and built-in functions
- // (encoded with names either starting with "erlang."/"bif_" or not
- // having a ".", such as a simple <Module>.<Function>.<Arity>, or an
- // "_", such as the BIF "suspend_0") as they are executed on another
- // stack.
- if (F->getName().find("erlang.") != StringRef::npos ||
- F->getName().find("bif_") != StringRef::npos ||
- F->getName().find_first_of("._") == StringRef::npos)
- continue;
-
- unsigned CalleeStkArity =
- F->arg_size() > CCRegisteredArgs ? F->arg_size()-CCRegisteredArgs : 0;
- if (HipeLeafWords - 1 > CalleeStkArity)
- MoreStackForCalls = std::max(MoreStackForCalls,
- (HipeLeafWords - 1 - CalleeStkArity) * SlotSize);
- }
- MaxStack += MoreStackForCalls;
- }
-
- // If the needed stack frame is larger than the guaranteed size, runtime checks
- // and calls to the "inc_stack_0" BIF should be inserted in the assembly prologue.
- if (MaxStack > Guaranteed) {
- MachineBasicBlock &prologueMBB = MF.front();
- MachineBasicBlock *stackCheckMBB = MF.CreateMachineBasicBlock();
- MachineBasicBlock *incStackMBB = MF.CreateMachineBasicBlock();
-
- for (MachineBasicBlock::livein_iterator I = prologueMBB.livein_begin(),
- E = prologueMBB.livein_end(); I != E; I++) {
- stackCheckMBB->addLiveIn(*I);
- incStackMBB->addLiveIn(*I);
- }
-
- MF.push_front(incStackMBB);
- MF.push_front(stackCheckMBB);
-
- unsigned ScratchReg, SPReg, PReg, SPLimitOffset;
- unsigned LEAop, CMPop, CALLop;
- if (Is64Bit) {
- SPReg = X86::RSP;
- PReg = X86::RBP;
- LEAop = X86::LEA64r;
- CMPop = X86::CMP64rm;
- CALLop = X86::CALL64pcrel32;
- SPLimitOffset = 0x90;
- } else {
- SPReg = X86::ESP;
- PReg = X86::EBP;
- LEAop = X86::LEA32r;
- CMPop = X86::CMP32rm;
- CALLop = X86::CALLpcrel32;
- SPLimitOffset = 0x4c;
- }
-
- ScratchReg = GetScratchRegister(Is64Bit, IsLP64, MF, true);
- assert(!MF.getRegInfo().isLiveIn(ScratchReg) &&
- "HiPE prologue scratch register is live-in");
-
- // Create new MBB for StackCheck:
- addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(LEAop), ScratchReg),
- SPReg, false, -MaxStack);
- // SPLimitOffset is in a fixed heap location (pointed by BP).
- addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(CMPop))
- .addReg(ScratchReg), PReg, false, SPLimitOffset);
- BuildMI(stackCheckMBB, DL, TII.get(X86::JAE_1)).addMBB(&prologueMBB);
-
- // Create new MBB for IncStack:
- BuildMI(incStackMBB, DL, TII.get(CALLop)).
- addExternalSymbol("inc_stack_0");
- addRegOffset(BuildMI(incStackMBB, DL, TII.get(LEAop), ScratchReg),
- SPReg, false, -MaxStack);
- addRegOffset(BuildMI(incStackMBB, DL, TII.get(CMPop))
- .addReg(ScratchReg), PReg, false, SPLimitOffset);
- BuildMI(incStackMBB, DL, TII.get(X86::JLE_1)).addMBB(incStackMBB);
-
- stackCheckMBB->addSuccessor(&prologueMBB, 99);
- stackCheckMBB->addSuccessor(incStackMBB, 1);
- incStackMBB->addSuccessor(&prologueMBB, 99);
- incStackMBB->addSuccessor(incStackMBB, 1);
- }
-#ifdef XDEBUG
- MF.verify();
-#endif
-}
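A worked example of the MaxStack computation above; 24 leaf words and 8-byte slots match the 64-bit constants used in this function, while the frame size and arities are invented:

#include <cstdio>

int main() {
  const unsigned SlotSize = 8, HipeLeafWords = 24, CCRegisteredArgs = 6;
  unsigned FrameSize = 64;      // MFI->getStackSize(), assumed
  unsigned CallerArity = 8;     // caller takes 8 args -> 2 on the stack

  unsigned CallerStkArity = CallerArity > CCRegisteredArgs
                                ? CallerArity - CCRegisteredArgs : 0;
  unsigned MaxStack = FrameSize + CallerStkArity * SlotSize + SlotSize; // 88

  // One Erlang callee with a single stack argument: reserve the rest of the
  // guaranteed leaf words for it.
  unsigned CalleeStkArity = 1;
  MaxStack += (HipeLeafWords - 1 - CalleeStkArity) * SlotSize;          // +176

  unsigned Guaranteed = HipeLeafWords * SlotSize;                       // 192
  std::printf("MaxStack=%u needsCheck=%d\n", MaxStack, MaxStack > Guaranteed);
}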
-
-void X86FrameLowering::
-eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I) const {
- const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
- const X86RegisterInfo &RegInfo = *static_cast<const X86RegisterInfo *>(
- MF.getSubtarget().getRegisterInfo());
- unsigned StackPtr = RegInfo.getStackRegister();
- bool reserveCallFrame = hasReservedCallFrame(MF);
- int Opcode = I->getOpcode();
- bool isDestroy = Opcode == TII.getCallFrameDestroyOpcode();
- const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
- bool IsLP64 = STI.isTarget64BitLP64();
- DebugLoc DL = I->getDebugLoc();
- uint64_t Amount = !reserveCallFrame ? I->getOperand(0).getImm() : 0;
- uint64_t InternalAmt = (isDestroy || Amount) ? I->getOperand(1).getImm() : 0;
- I = MBB.erase(I);
-
- if (!reserveCallFrame) {
- // If the stack pointer can be changed after the prologue, turn the
- // adjcallstackdown instruction into a 'sub ESP, <amt>' and the
- // adjcallstackup instruction into an 'add ESP, <amt>'.
- if (Amount == 0)
- return;
-
- // We need to keep the stack aligned properly. To do this, we round the
- // amount of space needed for the outgoing arguments up to the next
- // alignment boundary.
- unsigned StackAlign = MF.getTarget()
- .getSubtargetImpl()
- ->getFrameLowering()
- ->getStackAlignment();
- Amount = (Amount + StackAlign - 1) / StackAlign * StackAlign;
-
- MachineInstr *New = nullptr;
-
- // Factor out the amount that gets handled inside the sequence
- // (Pushes of argument for frame setup, callee pops for frame destroy)
- Amount -= InternalAmt;
-
- if (Amount) {
- if (Opcode == TII.getCallFrameSetupOpcode()) {
- New = BuildMI(MF, DL, TII.get(getSUBriOpcode(IsLP64, Amount)), StackPtr)
- .addReg(StackPtr).addImm(Amount);
- } else {
- assert(Opcode == TII.getCallFrameDestroyOpcode());
-
- unsigned Opc = getADDriOpcode(IsLP64, Amount);
- New = BuildMI(MF, DL, TII.get(Opc), StackPtr)
- .addReg(StackPtr).addImm(Amount);
- }
- }
-
- if (New) {
- // The EFLAGS implicit def is dead.
- New->getOperand(3).setIsDead();
-
- // Replace the pseudo instruction with a new instruction.
- MBB.insert(I, New);
- }
-
- return;
- }
-
- if (Opcode == TII.getCallFrameDestroyOpcode() && InternalAmt) {
- // If we are performing frame pointer elimination and if the callee pops
- // something off the stack pointer, add it back. We do this until we have
- // more advanced stack pointer tracking ability.
- unsigned Opc = getSUBriOpcode(IsLP64, InternalAmt);
- MachineInstr *New = BuildMI(MF, DL, TII.get(Opc), StackPtr)
- .addReg(StackPtr).addImm(InternalAmt);
-
- // The EFLAGS implicit def is dead.
- New->getOperand(3).setIsDead();
-
- // We are not tracking the stack pointer adjustment by the callee, so make
- // sure we restore the stack pointer immediately after the call; there may
- // be spill code inserted between the CALL and ADJCALLSTACKUP instructions.
- MachineBasicBlock::iterator B = MBB.begin();
- while (I != B && !std::prev(I)->isCall())
- --I;
- MBB.insert(I, New);
- }
-}
-
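As the comments in eliminateCallFramePseudoInstr above describe, the pseudo's byte count is rounded up to the stack alignment and the part already handled inside the call sequence is subtracted before a single SUB/ADD of the stack pointer is emitted. A small standalone sketch of that arithmetic; the 16-byte alignment and sample amounts are assumptions:

#include <cstdint>
#include <cstdio>

// Returns the immediate for the 'sub esp, N' (frame setup) or 'add esp, N'
// (frame destroy) that replaces the pseudo; 0 means no instruction is needed.
uint64_t callFrameAdjustment(uint64_t Amount, uint64_t InternalAmt,
                             unsigned StackAlign) {
  Amount = (Amount + StackAlign - 1) / StackAlign * StackAlign; // round up
  return Amount - InternalAmt; // pushes / callee pops already cover this part
}

int main() {
  // 20 bytes of outgoing arguments, 8 of them pushed inside the sequence:
  std::printf("%llu\n",
              (unsigned long long)callFrameAdjustment(20, 8, 16)); // prints 24
}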
+//===-- X86FrameLowering.cpp - X86 Frame Information ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the X86 implementation of TargetFrameLowering class. +// +//===----------------------------------------------------------------------===// + +#include "X86FrameLowering.h" +#include "X86InstrBuilder.h" +#include "X86InstrInfo.h" +#include "X86MachineFunctionInfo.h" +#include "X86Subtarget.h" +#include "X86TargetMachine.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Function.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/Support/Debug.h" +#include <cstdlib> + +using namespace llvm; + +// FIXME: completely move here. +extern cl::opt<bool> ForceStackAlign; + +bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { + return !MF.getFrameInfo()->hasVarSizedObjects(); +} + +/// hasFP - Return true if the specified function should have a dedicated frame +/// pointer register. This is true if the function has variable sized allocas +/// or if frame pointer elimination is disabled. +bool X86FrameLowering::hasFP(const MachineFunction &MF) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + const MachineModuleInfo &MMI = MF.getMMI(); + const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); + + return (MF.getTarget().Options.DisableFramePointerElim(MF) || + RegInfo->needsStackRealignment(MF) || + MFI->hasVarSizedObjects() || + MFI->isFrameAddressTaken() || MFI->hasInlineAsmWithSPAdjust() || + MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() || + MMI.callsUnwindInit() || MMI.callsEHReturn() || + MFI->hasStackMap() || MFI->hasPatchPoint()); +} + +static unsigned getSUBriOpcode(unsigned IsLP64, int64_t Imm) { + if (IsLP64) { + if (isInt<8>(Imm)) + return X86::SUB64ri8; + return X86::SUB64ri32; + } else { + if (isInt<8>(Imm)) + return X86::SUB32ri8; + return X86::SUB32ri; + } +} + +static unsigned getADDriOpcode(unsigned IsLP64, int64_t Imm) { + if (IsLP64) { + if (isInt<8>(Imm)) + return X86::ADD64ri8; + return X86::ADD64ri32; + } else { + if (isInt<8>(Imm)) + return X86::ADD32ri8; + return X86::ADD32ri; + } +} + +static unsigned getSUBrrOpcode(unsigned isLP64) { + return isLP64 ? X86::SUB64rr : X86::SUB32rr; +} + +static unsigned getADDrrOpcode(unsigned isLP64) { + return isLP64 ? X86::ADD64rr : X86::ADD32rr; +} + +static unsigned getANDriOpcode(bool IsLP64, int64_t Imm) { + if (IsLP64) { + if (isInt<8>(Imm)) + return X86::AND64ri8; + return X86::AND64ri32; + } + if (isInt<8>(Imm)) + return X86::AND32ri8; + return X86::AND32ri; +} + +static unsigned getPUSHiOpcode(bool IsLP64, MachineOperand MO) { + // We don't support LP64 for now. + assert(!IsLP64); + + if (MO.isImm() && isInt<8>(MO.getImm())) + return X86::PUSH32i8; + + return X86::PUSHi32;; +} + +static unsigned getLEArOpcode(unsigned IsLP64) { + return IsLP64 ? 
X86::LEA64r : X86::LEA32r; +} + +/// findDeadCallerSavedReg - Return a caller-saved register that isn't live +/// when it reaches the "return" instruction. We can then pop a stack object +/// to this register without worry about clobbering it. +static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + const TargetRegisterInfo &TRI, + bool Is64Bit) { + const MachineFunction *MF = MBB.getParent(); + const Function *F = MF->getFunction(); + if (!F || MF->getMMI().callsEHReturn()) + return 0; + + static const uint16_t CallerSavedRegs32Bit[] = { + X86::EAX, X86::EDX, X86::ECX, 0 + }; + + static const uint16_t CallerSavedRegs64Bit[] = { + X86::RAX, X86::RDX, X86::RCX, X86::RSI, X86::RDI, + X86::R8, X86::R9, X86::R10, X86::R11, 0 + }; + + unsigned Opc = MBBI->getOpcode(); + switch (Opc) { + default: return 0; + case X86::RETL: + case X86::RETQ: + case X86::RETIL: + case X86::RETIQ: + case X86::TCRETURNdi: + case X86::TCRETURNri: + case X86::TCRETURNmi: + case X86::TCRETURNdi64: + case X86::TCRETURNri64: + case X86::TCRETURNmi64: + case X86::EH_RETURN: + case X86::EH_RETURN64: { + SmallSet<uint16_t, 8> Uses; + for (unsigned i = 0, e = MBBI->getNumOperands(); i != e; ++i) { + MachineOperand &MO = MBBI->getOperand(i); + if (!MO.isReg() || MO.isDef()) + continue; + unsigned Reg = MO.getReg(); + if (!Reg) + continue; + for (MCRegAliasIterator AI(Reg, &TRI, true); AI.isValid(); ++AI) + Uses.insert(*AI); + } + + const uint16_t *CS = Is64Bit ? CallerSavedRegs64Bit : CallerSavedRegs32Bit; + for (; *CS; ++CS) + if (!Uses.count(*CS)) + return *CS; + } + } + + return 0; +} + +static bool isEAXLiveIn(MachineFunction &MF) { + for (MachineRegisterInfo::livein_iterator II = MF.getRegInfo().livein_begin(), + EE = MF.getRegInfo().livein_end(); II != EE; ++II) { + unsigned Reg = II->first; + + if (Reg == X86::RAX || Reg == X86::EAX || Reg == X86::AX || + Reg == X86::AH || Reg == X86::AL) + return true; + } + + return false; +} + +/// emitSPUpdate - Emit a series of instructions to increment / decrement the +/// stack pointer by a constant value. +static +void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, + unsigned StackPtr, int64_t NumBytes, + bool Is64BitTarget, bool Is64BitStackPtr, bool UseLEA, + const TargetInstrInfo &TII, const TargetRegisterInfo &TRI) { + bool isSub = NumBytes < 0; + uint64_t Offset = isSub ? -NumBytes : NumBytes; + unsigned Opc; + if (UseLEA) + Opc = getLEArOpcode(Is64BitStackPtr); + else + Opc = isSub + ? getSUBriOpcode(Is64BitStackPtr, Offset) + : getADDriOpcode(Is64BitStackPtr, Offset); + + uint64_t Chunk = (1LL << 31) - 1; + DebugLoc DL = MBB.findDebugLoc(MBBI); + + while (Offset) { + if (Offset > Chunk) { + // Rather than emit a long series of instructions for large offsets, + // load the offset into a register and do one sub/add + unsigned Reg = 0; + + if (isSub && !isEAXLiveIn(*MBB.getParent())) + Reg = (unsigned)(Is64BitTarget ? X86::RAX : X86::EAX); + else + Reg = findDeadCallerSavedReg(MBB, MBBI, TRI, Is64BitTarget); + + if (Reg) { + Opc = Is64BitTarget ? X86::MOV64ri : X86::MOV32ri; + BuildMI(MBB, MBBI, DL, TII.get(Opc), Reg) + .addImm(Offset); + Opc = isSub + ? getSUBrrOpcode(Is64BitTarget) + : getADDrrOpcode(Is64BitTarget); + MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr) + .addReg(StackPtr) + .addReg(Reg); + MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead. + Offset = 0; + continue; + } + } + + uint64_t ThisVal = (Offset > Chunk) ? 
Chunk : Offset; + if (ThisVal == (Is64BitTarget ? 8 : 4)) { + // Use push / pop instead. + unsigned Reg = isSub + ? (unsigned)(Is64BitTarget ? X86::RAX : X86::EAX) + : findDeadCallerSavedReg(MBB, MBBI, TRI, Is64BitTarget); + if (Reg) { + Opc = isSub + ? (Is64BitTarget ? X86::PUSH64r : X86::PUSH32r) + : (Is64BitTarget ? X86::POP64r : X86::POP32r); + MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc)) + .addReg(Reg, getDefRegState(!isSub) | getUndefRegState(isSub)); + if (isSub) + MI->setFlag(MachineInstr::FrameSetup); + Offset -= ThisVal; + continue; + } + } + + MachineInstr *MI = nullptr; + + if (UseLEA) { + MI = addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr), + StackPtr, false, isSub ? -ThisVal : ThisVal); + } else { + MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr) + .addReg(StackPtr) + .addImm(ThisVal); + MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead. + } + + if (isSub) + MI->setFlag(MachineInstr::FrameSetup); + + Offset -= ThisVal; + } +} + +/// mergeSPUpdatesUp - Merge two stack-manipulating instructions upper iterator. +static +void mergeSPUpdatesUp(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, + unsigned StackPtr, uint64_t *NumBytes = nullptr) { + if (MBBI == MBB.begin()) return; + + MachineBasicBlock::iterator PI = std::prev(MBBI); + unsigned Opc = PI->getOpcode(); + if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 || + Opc == X86::ADD32ri || Opc == X86::ADD32ri8 || + Opc == X86::LEA32r || Opc == X86::LEA64_32r) && + PI->getOperand(0).getReg() == StackPtr) { + if (NumBytes) + *NumBytes += PI->getOperand(2).getImm(); + MBB.erase(PI); + } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 || + Opc == X86::SUB32ri || Opc == X86::SUB32ri8) && + PI->getOperand(0).getReg() == StackPtr) { + if (NumBytes) + *NumBytes -= PI->getOperand(2).getImm(); + MBB.erase(PI); + } +} + +/// mergeSPUpdatesDown - Merge two stack-manipulating instructions lower +/// iterator. +static +void mergeSPUpdatesDown(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + unsigned StackPtr, uint64_t *NumBytes = nullptr) { + // FIXME: THIS ISN'T RUN!!! + return; + + if (MBBI == MBB.end()) return; + + MachineBasicBlock::iterator NI = std::next(MBBI); + if (NI == MBB.end()) return; + + unsigned Opc = NI->getOpcode(); + if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 || + Opc == X86::ADD32ri || Opc == X86::ADD32ri8) && + NI->getOperand(0).getReg() == StackPtr) { + if (NumBytes) + *NumBytes -= NI->getOperand(2).getImm(); + MBB.erase(NI); + MBBI = NI; + } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 || + Opc == X86::SUB32ri || Opc == X86::SUB32ri8) && + NI->getOperand(0).getReg() == StackPtr) { + if (NumBytes) + *NumBytes += NI->getOperand(2).getImm(); + MBB.erase(NI); + MBBI = NI; + } +} + +/// mergeSPUpdates - Checks the instruction before/after the passed +/// instruction. If it is an ADD/SUB/LEA instruction it is deleted argument and +/// the stack adjustment is returned as a positive value for ADD/LEA and a +/// negative for SUB. +static int mergeSPUpdates(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, unsigned StackPtr, + bool doMergeWithPrevious) { + if ((doMergeWithPrevious && MBBI == MBB.begin()) || + (!doMergeWithPrevious && MBBI == MBB.end())) + return 0; + + MachineBasicBlock::iterator PI = doMergeWithPrevious ? std::prev(MBBI) : MBBI; + MachineBasicBlock::iterator NI = doMergeWithPrevious ? 
nullptr + : std::next(MBBI); + unsigned Opc = PI->getOpcode(); + int Offset = 0; + + if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 || + Opc == X86::ADD32ri || Opc == X86::ADD32ri8 || + Opc == X86::LEA32r || Opc == X86::LEA64_32r) && + PI->getOperand(0).getReg() == StackPtr){ + Offset += PI->getOperand(2).getImm(); + MBB.erase(PI); + if (!doMergeWithPrevious) MBBI = NI; + } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 || + Opc == X86::SUB32ri || Opc == X86::SUB32ri8) && + PI->getOperand(0).getReg() == StackPtr) { + Offset -= PI->getOperand(2).getImm(); + MBB.erase(PI); + if (!doMergeWithPrevious) MBBI = NI; + } + + return Offset; +} + +void +X86FrameLowering::emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL) const { + MachineFunction &MF = *MBB.getParent(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + MachineModuleInfo &MMI = MF.getMMI(); + const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo(); + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + + // Add callee saved registers to move list. + const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); + if (CSI.empty()) return; + + // Calculate offsets. + for (std::vector<CalleeSavedInfo>::const_iterator + I = CSI.begin(), E = CSI.end(); I != E; ++I) { + int64_t Offset = MFI->getObjectOffset(I->getFrameIdx()); + unsigned Reg = I->getReg(); + + unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true); + unsigned CFIIndex = + MMI.addFrameInst(MCCFIInstruction::createOffset(nullptr, DwarfReg, + Offset)); + BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + } +} + +/// usesTheStack - This function checks if any of the users of EFLAGS +/// copies the EFLAGS. We know that the code that lowers COPY of EFLAGS has +/// to use the stack, and if we don't adjust the stack we clobber the first +/// frame index. +/// See X86InstrInfo::copyPhysReg. +static bool usesTheStack(const MachineFunction &MF) { + const MachineRegisterInfo &MRI = MF.getRegInfo(); + + for (MachineRegisterInfo::reg_instr_iterator + ri = MRI.reg_instr_begin(X86::EFLAGS), re = MRI.reg_instr_end(); + ri != re; ++ri) + if (ri->isCopy()) + return true; + + return false; +} + +void X86FrameLowering::emitStackProbeCall(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL) { + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); + bool Is64Bit = STI.is64Bit(); + bool IsLargeCodeModel = MF.getTarget().getCodeModel() == CodeModel::Large; + + unsigned CallOp; + if (Is64Bit) + CallOp = IsLargeCodeModel ? X86::CALL64r : X86::CALL64pcrel32; + else + CallOp = X86::CALLpcrel32; + + const char *Symbol; + if (Is64Bit) { + if (STI.isTargetCygMing()) { + Symbol = "___chkstk_ms"; + } else { + Symbol = "__chkstk"; + } + } else if (STI.isTargetCygMing()) + Symbol = "_alloca"; + else + Symbol = "_chkstk"; + + MachineInstrBuilder CI; + + // All current stack probes take AX and SP as input, clobber flags, and + // preserve all registers. x86_64 probes leave RSP unmodified. + if (Is64Bit && MF.getTarget().getCodeModel() == CodeModel::Large) { + // For the large code model, we have to call through a register. Use R11, + // as it is scratch in all supported calling conventions. 
+ BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::R11) + .addExternalSymbol(Symbol); + CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp)).addReg(X86::R11); + } else { + CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp)).addExternalSymbol(Symbol); + } + + unsigned AX = Is64Bit ? X86::RAX : X86::EAX; + unsigned SP = Is64Bit ? X86::RSP : X86::ESP; + CI.addReg(AX, RegState::Implicit) + .addReg(SP, RegState::Implicit) + .addReg(AX, RegState::Define | RegState::Implicit) + .addReg(SP, RegState::Define | RegState::Implicit) + .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); + + if (Is64Bit) { + // MSVC x64's __chkstk and cygwin/mingw's ___chkstk_ms do not adjust %rsp + // themselves. It also does not clobber %rax so we can reuse it when + // adjusting %rsp. + BuildMI(MBB, MBBI, DL, TII.get(X86::SUB64rr), X86::RSP) + .addReg(X86::RSP) + .addReg(X86::RAX); + } +} + +/// emitPrologue - Push callee-saved registers onto the stack, which +/// automatically adjust the stack pointer. Adjust the stack pointer to allocate +/// space for local variables. Also emit labels used by the exception handler to +/// generate the exception handling frames. + +/* + Here's a gist of what gets emitted: + + ; Establish frame pointer, if needed + [if needs FP] + push %rbp + .cfi_def_cfa_offset 16 + .cfi_offset %rbp, -16 + .seh_pushreg %rpb + mov %rsp, %rbp + .cfi_def_cfa_register %rbp + + ; Spill general-purpose registers + [for all callee-saved GPRs] + pushq %<reg> + [if not needs FP] + .cfi_def_cfa_offset (offset from RETADDR) + .seh_pushreg %<reg> + + ; If the required stack alignment > default stack alignment + ; rsp needs to be re-aligned. This creates a "re-alignment gap" + ; of unknown size in the stack frame. + [if stack needs re-alignment] + and $MASK, %rsp + + ; Allocate space for locals + [if target is Windows and allocated space > 4096 bytes] + ; Windows needs special care for allocations larger + ; than one page. + mov $NNN, %rax + call ___chkstk_ms/___chkstk + sub %rax, %rsp + [else] + sub $NNN, %rsp + + [if needs FP] + .seh_stackalloc (size of XMM spill slots) + .seh_setframe %rbp, SEHFrameOffset ; = size of all spill slots + [else] + .seh_stackalloc NNN + + ; Spill XMMs + ; Note, that while only Windows 64 ABI specifies XMMs as callee-preserved, + ; they may get spilled on any platform, if the current function + ; calls @llvm.eh.unwind.init + [if needs FP] + [for all callee-saved XMM registers] + movaps %<xmm reg>, -MMM(%rbp) + [for all callee-saved XMM registers] + .seh_savexmm %<xmm reg>, (-MMM + SEHFrameOffset) + ; i.e. the offset relative to (%rbp - SEHFrameOffset) + [else] + [for all callee-saved XMM registers] + movaps %<xmm reg>, KKK(%rsp) + [for all callee-saved XMM registers] + .seh_savexmm %<xmm reg>, KKK + + .seh_endprologue + + [if needs base pointer] + mov %rsp, %rbx + [if needs to restore base pointer] + mov %rsp, -MMM(%rbp) + + ; Emit CFI info + [if needs FP] + [for all callee-saved registers] + .cfi_offset %<reg>, (offset from %rbp) + [else] + .cfi_def_cfa_offset (offset from RETADDR) + [for all callee-saved registers] + .cfi_offset %<reg>, (offset from %rsp) + + Notes: + - .seh directives are emitted only for Windows 64 ABI + - .cfi directives are emitted for all other ABIs + - for 32-bit code, substitute %e?? registers for %r?? +*/ + +void X86FrameLowering::emitPrologue(MachineFunction &MF) const { + MachineBasicBlock &MBB = MF.front(); // Prologue goes in entry BB. 
+ MachineBasicBlock::iterator MBBI = MBB.begin(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + const Function *Fn = MF.getFunction(); + const X86RegisterInfo *RegInfo = + static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo()); + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + MachineModuleInfo &MMI = MF.getMMI(); + X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + uint64_t MaxAlign = MFI->getMaxAlignment(); // Desired stack alignment. + uint64_t StackSize = MFI->getStackSize(); // Number of bytes to allocate. + bool HasFP = hasFP(MF); + const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); + bool Is64Bit = STI.is64Bit(); + // standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit. + const bool Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64(); + bool IsWin64 = STI.isTargetWin64(); + // Not necessarily synonymous with IsWin64. + bool IsWinEH = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); + bool NeedsWinEH = IsWinEH && Fn->needsUnwindTableEntry(); + bool NeedsDwarfCFI = + !IsWinEH && (MMI.hasDebugInfo() || Fn->needsUnwindTableEntry()); + bool UseLEA = STI.useLeaForSP(); + unsigned StackAlign = getStackAlignment(); + unsigned SlotSize = RegInfo->getSlotSize(); + unsigned FramePtr = RegInfo->getFrameRegister(MF); + const unsigned MachineFramePtr = STI.isTarget64BitILP32() ? + getX86SubSuperRegister(FramePtr, MVT::i64, false) : FramePtr; + unsigned StackPtr = RegInfo->getStackRegister(); + unsigned BasePtr = RegInfo->getBaseRegister(); + DebugLoc DL; + + // If we're forcing a stack realignment we can't rely on just the frame + // info, we need to know the ABI stack alignment as well in case we + // have a call out. Otherwise just make sure we have some alignment - we'll + // go with the minimum SlotSize. + if (ForceStackAlign) { + if (MFI->hasCalls()) + MaxAlign = (StackAlign > MaxAlign) ? StackAlign : MaxAlign; + else if (MaxAlign < SlotSize) + MaxAlign = SlotSize; + } + + // Add RETADDR move area to callee saved frame size. + int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta(); + if (TailCallReturnAddrDelta < 0) + X86FI->setCalleeSavedFrameSize( + X86FI->getCalleeSavedFrameSize() - TailCallReturnAddrDelta); + + bool UseStackProbe = (STI.isOSWindows() && !STI.isTargetMachO()); + + // The default stack probe size is 4096 if the function has no stackprobesize + // attribute. + unsigned StackProbeSize = 4096; + if (Fn->hasFnAttribute("stack-probe-size")) + Fn->getFnAttribute("stack-probe-size") + .getValueAsString() + .getAsInteger(0, StackProbeSize); + + // If this is x86-64 and the Red Zone is not disabled, if we are a leaf + // function, and use up to 128 bytes of stack space, don't have a frame + // pointer, calls, or dynamic alloca then we do not need to adjust the + // stack pointer (we fit in the Red Zone). We also check that we don't + // push and pop from the stack. + if (Is64Bit && !Fn->getAttributes().hasAttribute(AttributeSet::FunctionIndex, + Attribute::NoRedZone) && + !RegInfo->needsStackRealignment(MF) && + !MFI->hasVarSizedObjects() && // No dynamic alloca. + !MFI->adjustsStack() && // No calls. + !IsWin64 && // Win64 has no Red Zone + !usesTheStack(MF) && // Don't push and pop. + !MF.shouldSplitStack()) { // Regular stack + uint64_t MinSize = X86FI->getCalleeSavedFrameSize(); + if (HasFP) MinSize += SlotSize; + StackSize = std::max(MinSize, StackSize > 128 ? 
StackSize - 128 : 0); + MFI->setStackSize(StackSize); + } + + // Insert stack pointer adjustment for later moving of return addr. Only + // applies to tail call optimized functions where the callee argument stack + // size is bigger than the callers. + if (TailCallReturnAddrDelta < 0) { + MachineInstr *MI = + BuildMI(MBB, MBBI, DL, + TII.get(getSUBriOpcode(Uses64BitFramePtr, -TailCallReturnAddrDelta)), + StackPtr) + .addReg(StackPtr) + .addImm(-TailCallReturnAddrDelta) + .setMIFlag(MachineInstr::FrameSetup); + MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead. + } + + // Mapping for machine moves: + // + // DST: VirtualFP AND + // SRC: VirtualFP => DW_CFA_def_cfa_offset + // ELSE => DW_CFA_def_cfa + // + // SRC: VirtualFP AND + // DST: Register => DW_CFA_def_cfa_register + // + // ELSE + // OFFSET < 0 => DW_CFA_offset_extended_sf + // REG < 64 => DW_CFA_offset + Reg + // ELSE => DW_CFA_offset_extended + + uint64_t NumBytes = 0; + int stackGrowth = -SlotSize; + + if (HasFP) { + // Calculate required stack adjustment. + uint64_t FrameSize = StackSize - SlotSize; + // If required, include space for extra hidden slot for stashing base pointer. + if (X86FI->getRestoreBasePointer()) + FrameSize += SlotSize; + if (RegInfo->needsStackRealignment(MF)) { + // Callee-saved registers are pushed on stack before the stack + // is realigned. + FrameSize -= X86FI->getCalleeSavedFrameSize(); + NumBytes = (FrameSize + MaxAlign - 1) / MaxAlign * MaxAlign; + } else { + NumBytes = FrameSize - X86FI->getCalleeSavedFrameSize(); + } + + // Get the offset of the stack slot for the EBP register, which is + // guaranteed to be the last slot by processFunctionBeforeFrameFinalized. + // Update the frame offset adjustment. + MFI->setOffsetAdjustment(-NumBytes); + + // Save EBP/RBP into the appropriate stack slot. + BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r)) + .addReg(MachineFramePtr, RegState::Kill) + .setMIFlag(MachineInstr::FrameSetup); + + if (NeedsDwarfCFI) { + // Mark the place where EBP/RBP was saved. + // Define the current CFA rule to use the provided offset. + assert(StackSize); + unsigned CFIIndex = MMI.addFrameInst( + MCCFIInstruction::createDefCfaOffset(nullptr, 2 * stackGrowth)); + BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + + // Change the rule for the FramePtr to be an "offset" rule. + unsigned DwarfFramePtr = RegInfo->getDwarfRegNum(MachineFramePtr, true); + CFIIndex = MMI.addFrameInst( + MCCFIInstruction::createOffset(nullptr, + DwarfFramePtr, 2 * stackGrowth)); + BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + } + + if (NeedsWinEH) { + BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg)) + .addImm(FramePtr) + .setMIFlag(MachineInstr::FrameSetup); + } + + // Update EBP with the new base value. + BuildMI(MBB, MBBI, DL, + TII.get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr), FramePtr) + .addReg(StackPtr) + .setMIFlag(MachineInstr::FrameSetup); + + if (NeedsDwarfCFI) { + // Mark effective beginning of when frame pointer becomes valid. + // Define the current CFA to use the EBP/RBP register. + unsigned DwarfFramePtr = RegInfo->getDwarfRegNum(MachineFramePtr, true); + unsigned CFIIndex = MMI.addFrameInst( + MCCFIInstruction::createDefCfaRegister(nullptr, DwarfFramePtr)); + BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + } + + // Mark the FramePtr as live-in in every block. 
+ for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) + I->addLiveIn(MachineFramePtr); + } else { + NumBytes = StackSize - X86FI->getCalleeSavedFrameSize(); + } + + // Skip the callee-saved push instructions. + bool PushedRegs = false; + int StackOffset = 2 * stackGrowth; + + while (MBBI != MBB.end() && + (MBBI->getOpcode() == X86::PUSH32r || + MBBI->getOpcode() == X86::PUSH64r)) { + PushedRegs = true; + unsigned Reg = MBBI->getOperand(0).getReg(); + ++MBBI; + + if (!HasFP && NeedsDwarfCFI) { + // Mark callee-saved push instruction. + // Define the current CFA rule to use the provided offset. + assert(StackSize); + unsigned CFIIndex = MMI.addFrameInst( + MCCFIInstruction::createDefCfaOffset(nullptr, StackOffset)); + BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + StackOffset += stackGrowth; + } + + if (NeedsWinEH) { + BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg)).addImm(Reg).setMIFlag( + MachineInstr::FrameSetup); + } + } + + // Realign stack after we pushed callee-saved registers (so that we'll be + // able to calculate their offsets from the frame pointer). + if (RegInfo->needsStackRealignment(MF)) { + assert(HasFP && "There should be a frame pointer if stack is realigned."); + uint64_t Val = -MaxAlign; + MachineInstr *MI = + BuildMI(MBB, MBBI, DL, + TII.get(getANDriOpcode(Uses64BitFramePtr, Val)), StackPtr) + .addReg(StackPtr) + .addImm(Val) + .setMIFlag(MachineInstr::FrameSetup); + + // The EFLAGS implicit def is dead. + MI->getOperand(3).setIsDead(); + } + + // If there is an SUB32ri of ESP immediately before this instruction, merge + // the two. This can be the case when tail call elimination is enabled and + // the callee has more arguments then the caller. + NumBytes -= mergeSPUpdates(MBB, MBBI, StackPtr, true); + + // If there is an ADD32ri or SUB32ri of ESP immediately after this + // instruction, merge the two instructions. + mergeSPUpdatesDown(MBB, MBBI, StackPtr, &NumBytes); + + // Adjust stack pointer: ESP -= numbytes. + + // Windows and cygwin/mingw require a prologue helper routine when allocating + // more than 4K bytes on the stack. Windows uses __chkstk and cygwin/mingw + // uses __alloca. __alloca and the 32-bit version of __chkstk will probe the + // stack and adjust the stack pointer in one go. The 64-bit version of + // __chkstk is only responsible for probing the stack. The 64-bit prologue is + // responsible for adjusting the stack pointer. Touching the stack at 4K + // increments is necessary to ensure that the guard pages used by the OS + // virtual memory manager are allocated in correct sequence. + if (NumBytes >= StackProbeSize && UseStackProbe) { + // Check whether EAX is livein for this function. + bool isEAXAlive = isEAXLiveIn(MF); + + if (isEAXAlive) { + // Sanity check that EAX is not livein for this function. + // It should not be, so throw an assert. + assert(!Is64Bit && "EAX is livein in x64 case!"); + + // Save EAX + BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH32r)) + .addReg(X86::EAX, RegState::Kill) + .setMIFlag(MachineInstr::FrameSetup); + } + + if (Is64Bit) { + // Handle the 64-bit Windows ABI case where we need to call __chkstk. + // Function prologue is responsible for adjusting the stack pointer. + BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::RAX) + .addImm(NumBytes) + .setMIFlag(MachineInstr::FrameSetup); + } else { + // Allocate NumBytes-4 bytes on stack in case of isEAXAlive. + // We'll also use 4 already allocated bytes for EAX. 
+ BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX) + .addImm(isEAXAlive ? NumBytes - 4 : NumBytes) + .setMIFlag(MachineInstr::FrameSetup); + } + + // Save a pointer to the MI where we set AX. + MachineBasicBlock::iterator SetRAX = MBBI; + --SetRAX; + + // Call __chkstk, __chkstk_ms, or __alloca. + emitStackProbeCall(MF, MBB, MBBI, DL); + + // Apply the frame setup flag to all inserted instrs. + for (; SetRAX != MBBI; ++SetRAX) + SetRAX->setFlag(MachineInstr::FrameSetup); + + if (isEAXAlive) { + // Restore EAX + MachineInstr *MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm), + X86::EAX), + StackPtr, false, NumBytes - 4); + MI->setFlag(MachineInstr::FrameSetup); + MBB.insert(MBBI, MI); + } + } else if (NumBytes) { + emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, Uses64BitFramePtr, + UseLEA, TII, *RegInfo); + } + + int SEHFrameOffset = 0; + if (NeedsWinEH) { + if (HasFP) { + // We need to set frame base offset low enough such that all saved + // register offsets would be positive relative to it, but we can't + // just use NumBytes, because .seh_setframe offset must be <=240. + // So we pretend to have only allocated enough space to spill the + // non-volatile registers. + // We don't care about the rest of stack allocation, because unwinder + // will restore SP to (BP - SEHFrameOffset) + for (const CalleeSavedInfo &Info : MFI->getCalleeSavedInfo()) { + int offset = MFI->getObjectOffset(Info.getFrameIdx()); + SEHFrameOffset = std::max(SEHFrameOffset, std::abs(offset)); + } + SEHFrameOffset += SEHFrameOffset % 16; // ensure alignmant + + // This only needs to account for XMM spill slots, GPR slots + // are covered by the .seh_pushreg's emitted above. + unsigned Size = SEHFrameOffset - X86FI->getCalleeSavedFrameSize(); + if (Size) { + BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_StackAlloc)) + .addImm(Size) + .setMIFlag(MachineInstr::FrameSetup); + } + + BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SetFrame)) + .addImm(FramePtr) + .addImm(SEHFrameOffset) + .setMIFlag(MachineInstr::FrameSetup); + } else { + // SP will be the base register for restoring XMMs + if (NumBytes) { + BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_StackAlloc)) + .addImm(NumBytes) + .setMIFlag(MachineInstr::FrameSetup); + } + } + } + + // Skip the rest of register spilling code + while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup)) + ++MBBI; + + // Emit SEH info for non-GPRs + if (NeedsWinEH) { + for (const CalleeSavedInfo &Info : MFI->getCalleeSavedInfo()) { + unsigned Reg = Info.getReg(); + if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg)) + continue; + assert(X86::FR64RegClass.contains(Reg) && "Unexpected register class"); + + int Offset = getFrameIndexOffset(MF, Info.getFrameIdx()); + Offset += SEHFrameOffset; + + BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SaveXMM)) + .addImm(Reg) + .addImm(Offset) + .setMIFlag(MachineInstr::FrameSetup); + } + + BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_EndPrologue)) + .setMIFlag(MachineInstr::FrameSetup); + } + + // If we need a base pointer, set it up here. It's whatever the value + // of the stack pointer is at this point. Any variable size objects + // will be allocated after this, so we can still use the base pointer + // to reference locals. + if (RegInfo->hasBasePointer(MF)) { + // Update the base pointer with the current stack pointer. + unsigned Opc = Uses64BitFramePtr ? 
X86::MOV64rr : X86::MOV32rr; + BuildMI(MBB, MBBI, DL, TII.get(Opc), BasePtr) + .addReg(StackPtr) + .setMIFlag(MachineInstr::FrameSetup); + if (X86FI->getRestoreBasePointer()) { + // Stash value of base pointer. Saving RSP instead of EBP shortens dependence chain. + unsigned Opm = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr; + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)), + FramePtr, true, X86FI->getRestoreBasePointerOffset()) + .addReg(StackPtr) + .setMIFlag(MachineInstr::FrameSetup); + } + } + + if (((!HasFP && NumBytes) || PushedRegs) && NeedsDwarfCFI) { + // Mark end of stack pointer adjustment. + if (!HasFP && NumBytes) { + // Define the current CFA rule to use the provided offset. + assert(StackSize); + unsigned CFIIndex = MMI.addFrameInst( + MCCFIInstruction::createDefCfaOffset(nullptr, + -StackSize + stackGrowth)); + + BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + } + + // Emit DWARF info specifying the offsets of the callee-saved registers. + if (PushedRegs) + emitCalleeSavedFrameMoves(MBB, MBBI, DL); + } +} + +void X86FrameLowering::emitEpilogue(MachineFunction &MF, + MachineBasicBlock &MBB) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + const X86RegisterInfo *RegInfo = + static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo()); + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); + assert(MBBI != MBB.end() && "Returning block has no instructions"); + unsigned RetOpcode = MBBI->getOpcode(); + DebugLoc DL = MBBI->getDebugLoc(); + const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); + bool Is64Bit = STI.is64Bit(); + // standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit. + const bool Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64(); + const bool Is64BitILP32 = STI.isTarget64BitILP32(); + bool UseLEA = STI.useLeaForSP(); + unsigned StackAlign = getStackAlignment(); + unsigned SlotSize = RegInfo->getSlotSize(); + unsigned FramePtr = RegInfo->getFrameRegister(MF); + unsigned MachineFramePtr = Is64BitILP32 ? + getX86SubSuperRegister(FramePtr, MVT::i64, false) : FramePtr; + unsigned StackPtr = RegInfo->getStackRegister(); + + bool IsWinEH = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); + bool NeedsWinEH = IsWinEH && MF.getFunction()->needsUnwindTableEntry(); + + switch (RetOpcode) { + default: + llvm_unreachable("Can only insert epilog into returning blocks"); + case X86::RETQ: + case X86::RETL: + case X86::RETIL: + case X86::RETIQ: + case X86::TCRETURNdi: + case X86::TCRETURNri: + case X86::TCRETURNmi: + case X86::TCRETURNdi64: + case X86::TCRETURNri64: + case X86::TCRETURNmi64: + case X86::EH_RETURN: + case X86::EH_RETURN64: + break; // These are ok + } + + // Get the number of bytes to allocate from the FrameInfo. + uint64_t StackSize = MFI->getStackSize(); + uint64_t MaxAlign = MFI->getMaxAlignment(); + unsigned CSSize = X86FI->getCalleeSavedFrameSize(); + uint64_t NumBytes = 0; + + // If we're forcing a stack realignment we can't rely on just the frame + // info, we need to know the ABI stack alignment as well in case we + // have a call out. Otherwise just make sure we have some alignment - we'll + // go with the minimum. + if (ForceStackAlign) { + if (MFI->hasCalls()) + MaxAlign = (StackAlign > MaxAlign) ? StackAlign : MaxAlign; + else + MaxAlign = MaxAlign ? 
MaxAlign : 4; + } + + if (hasFP(MF)) { + // Calculate required stack adjustment. + uint64_t FrameSize = StackSize - SlotSize; + if (RegInfo->needsStackRealignment(MF)) { + // Callee-saved registers were pushed on stack before the stack + // was realigned. + FrameSize -= CSSize; + NumBytes = (FrameSize + MaxAlign - 1) / MaxAlign * MaxAlign; + } else { + NumBytes = FrameSize - CSSize; + } + + // Pop EBP. + BuildMI(MBB, MBBI, DL, + TII.get(Is64Bit ? X86::POP64r : X86::POP32r), MachineFramePtr); + } else { + NumBytes = StackSize - CSSize; + } + + // Skip the callee-saved pop instructions. + while (MBBI != MBB.begin()) { + MachineBasicBlock::iterator PI = std::prev(MBBI); + unsigned Opc = PI->getOpcode(); + + if (Opc != X86::POP32r && Opc != X86::POP64r && Opc != X86::DBG_VALUE && + !PI->isTerminator()) + break; + + --MBBI; + } + MachineBasicBlock::iterator FirstCSPop = MBBI; + + DL = MBBI->getDebugLoc(); + + // If there is an ADD32ri or SUB32ri of ESP immediately before this + // instruction, merge the two instructions. + if (NumBytes || MFI->hasVarSizedObjects()) + mergeSPUpdatesUp(MBB, MBBI, StackPtr, &NumBytes); + + // If dynamic alloca is used, then reset esp to point to the last callee-saved + // slot before popping them off! Same applies for the case, when stack was + // realigned. + if (RegInfo->needsStackRealignment(MF) || MFI->hasVarSizedObjects()) { + if (RegInfo->needsStackRealignment(MF)) + MBBI = FirstCSPop; + if (CSSize != 0) { + unsigned Opc = getLEArOpcode(Uses64BitFramePtr); + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr), + FramePtr, false, -CSSize); + --MBBI; + } else { + unsigned Opc = (Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr); + BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr) + .addReg(FramePtr); + --MBBI; + } + } else if (NumBytes) { + // Adjust stack pointer back: ESP += numbytes. + emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, Uses64BitFramePtr, UseLEA, + TII, *RegInfo); + --MBBI; + } + + // Windows unwinder will not invoke function's exception handler if IP is + // either in prologue or in epilogue. This behavior causes a problem when a + // call immediately precedes an epilogue, because the return address points + // into the epilogue. To cope with that, we insert an epilogue marker here, + // then replace it with a 'nop' if it ends up immediately after a CALL in the + // final emitted code. + if (NeedsWinEH) + BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_Epilogue)); + + // We're returning from function via eh_return. + if (RetOpcode == X86::EH_RETURN || RetOpcode == X86::EH_RETURN64) { + MBBI = MBB.getLastNonDebugInstr(); + MachineOperand &DestAddr = MBBI->getOperand(0); + assert(DestAddr.isReg() && "Offset should be in register!"); + BuildMI(MBB, MBBI, DL, + TII.get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr), + StackPtr).addReg(DestAddr.getReg()); + } else if (RetOpcode == X86::TCRETURNri || RetOpcode == X86::TCRETURNdi || + RetOpcode == X86::TCRETURNmi || + RetOpcode == X86::TCRETURNri64 || RetOpcode == X86::TCRETURNdi64 || + RetOpcode == X86::TCRETURNmi64) { + bool isMem = RetOpcode == X86::TCRETURNmi || RetOpcode == X86::TCRETURNmi64; + // Tail call return: adjust the stack pointer and jump to callee. + MBBI = MBB.getLastNonDebugInstr(); + MachineOperand &JumpTarget = MBBI->getOperand(0); + MachineOperand &StackAdjust = MBBI->getOperand(isMem ? 5 : 1); + assert(StackAdjust.isImm() && "Expecting immediate value."); + + // Adjust stack pointer. 
+ int StackAdj = StackAdjust.getImm(); + int MaxTCDelta = X86FI->getTCReturnAddrDelta(); + int Offset = 0; + assert(MaxTCDelta <= 0 && "MaxTCDelta should never be positive"); + + // Incoporate the retaddr area. + Offset = StackAdj-MaxTCDelta; + assert(Offset >= 0 && "Offset should never be negative"); + + if (Offset) { + // Check for possible merge with preceding ADD instruction. + Offset += mergeSPUpdates(MBB, MBBI, StackPtr, true); + emitSPUpdate(MBB, MBBI, StackPtr, Offset, Is64Bit, Uses64BitFramePtr, + UseLEA, TII, *RegInfo); + } + + // Jump to label or value in register. + bool IsWin64 = STI.isTargetWin64(); + if (RetOpcode == X86::TCRETURNdi || RetOpcode == X86::TCRETURNdi64) { + unsigned Op = (RetOpcode == X86::TCRETURNdi) + ? X86::TAILJMPd + : (IsWin64 ? X86::TAILJMPd64_REX : X86::TAILJMPd64); + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII.get(Op)); + if (JumpTarget.isGlobal()) + MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(), + JumpTarget.getTargetFlags()); + else { + assert(JumpTarget.isSymbol()); + MIB.addExternalSymbol(JumpTarget.getSymbolName(), + JumpTarget.getTargetFlags()); + } + } else if (RetOpcode == X86::TCRETURNmi || RetOpcode == X86::TCRETURNmi64) { + unsigned Op = (RetOpcode == X86::TCRETURNmi) + ? X86::TAILJMPm + : (IsWin64 ? X86::TAILJMPm64_REX : X86::TAILJMPm64); + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII.get(Op)); + for (unsigned i = 0; i != 5; ++i) + MIB.addOperand(MBBI->getOperand(i)); + } else if (RetOpcode == X86::TCRETURNri64) { + BuildMI(MBB, MBBI, DL, + TII.get(IsWin64 ? X86::TAILJMPr64_REX : X86::TAILJMPr64)) + .addReg(JumpTarget.getReg(), RegState::Kill); + } else { + BuildMI(MBB, MBBI, DL, TII.get(X86::TAILJMPr)). + addReg(JumpTarget.getReg(), RegState::Kill); + } + + MachineInstr *NewMI = std::prev(MBBI); + NewMI->copyImplicitOps(MF, MBBI); + + // Delete the pseudo instruction TCRETURN. + MBB.erase(MBBI); + } else if ((RetOpcode == X86::RETQ || RetOpcode == X86::RETL || + RetOpcode == X86::RETIQ || RetOpcode == X86::RETIL) && + (X86FI->getTCReturnAddrDelta() < 0)) { + // Add the return addr area delta back since we are not tail calling. + int delta = -1*X86FI->getTCReturnAddrDelta(); + MBBI = MBB.getLastNonDebugInstr(); + + // Check for possible merge with preceding ADD instruction. + delta += mergeSPUpdates(MBB, MBBI, StackPtr, true); + emitSPUpdate(MBB, MBBI, StackPtr, delta, Is64Bit, Uses64BitFramePtr, UseLEA, TII, + *RegInfo); + } +} + +int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF, + int FI) const { + const X86RegisterInfo *RegInfo = + static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo()); + const MachineFrameInfo *MFI = MF.getFrameInfo(); + int Offset = MFI->getObjectOffset(FI) - getOffsetOfLocalArea(); + uint64_t StackSize = MFI->getStackSize(); + + if (RegInfo->hasBasePointer(MF)) { + assert (hasFP(MF) && "VLAs and dynamic stack realign, but no FP?!"); + if (FI < 0) { + // Skip the saved EBP. + return Offset + RegInfo->getSlotSize(); + } else { + assert((-(Offset + StackSize)) % MFI->getObjectAlignment(FI) == 0); + return Offset + StackSize; + } + } else if (RegInfo->needsStackRealignment(MF)) { + if (FI < 0) { + // Skip the saved EBP. + return Offset + RegInfo->getSlotSize(); + } else { + assert((-(Offset + StackSize)) % MFI->getObjectAlignment(FI) == 0); + return Offset + StackSize; + } + // FIXME: Support tail calls + } else { + if (!hasFP(MF)) + return Offset + StackSize; + + // Skip the saved EBP. 
+ Offset += RegInfo->getSlotSize(); + + // Skip the RETADDR move area + const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta(); + if (TailCallReturnAddrDelta < 0) + Offset -= TailCallReturnAddrDelta; + } + + return Offset; +} + +int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg) const { + const X86RegisterInfo *RegInfo = + static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo()); + // We can't calculate offset from frame pointer if the stack is realigned, + // so enforce usage of stack/base pointer. The base pointer is used when we + // have dynamic allocas in addition to dynamic realignment. + if (RegInfo->hasBasePointer(MF)) + FrameReg = RegInfo->getBaseRegister(); + else if (RegInfo->needsStackRealignment(MF)) + FrameReg = RegInfo->getStackRegister(); + else + FrameReg = RegInfo->getFrameRegister(MF); + return getFrameIndexOffset(MF, FI); +} + +// Simplified from getFrameIndexOffset keeping only StackPointer cases +int X86FrameLowering::getFrameIndexOffsetFromSP(const MachineFunction &MF, int FI) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + // Does not include any dynamic realign. + const uint64_t StackSize = MFI->getStackSize(); + { +#ifndef NDEBUG + const X86RegisterInfo *RegInfo = + static_cast<const X86RegisterInfo*>(MF.getSubtarget().getRegisterInfo()); + // Note: LLVM arranges the stack as: + // Args > Saved RetPC (<--FP) > CSRs > dynamic alignment (<--BP) + // > "Stack Slots" (<--SP) + // We can always address StackSlots from RSP. We can usually (unless + // needsStackRealignment) address CSRs from RSP, but sometimes need to + // address them from RBP. FixedObjects can be placed anywhere in the stack + // frame depending on their specific requirements (i.e. we can actually + // refer to arguments to the function which are stored in the *callers* + // frame). As a result, THE RESULT OF THIS CALL IS MEANINGLESS FOR CSRs + // AND FixedObjects IFF needsStackRealignment or hasVarSizedObject. + + assert(!RegInfo->hasBasePointer(MF) && "we don't handle this case"); + + // We don't handle tail calls, and shouldn't be seeing them + // either. + int TailCallReturnAddrDelta = + MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta(); + assert(!(TailCallReturnAddrDelta < 0) && "we don't handle this case!"); +#endif + } + + // This is how the math works out: + // + // %rsp grows (i.e. gets lower) left to right. Each box below is + // one word (eight bytes). Obj0 is the stack slot we're trying to + // get to. + // + // ---------------------------------- + // | BP | Obj0 | Obj1 | ... | ObjN | + // ---------------------------------- + // ^ ^ ^ ^ + // A B C E + // + // A is the incoming stack pointer. + // (B - A) is the local area offset (-8 for x86-64) [1] + // (C - A) is the Offset returned by MFI->getObjectOffset for Obj0 [2] + // + // |(E - B)| is the StackSize (absolute value, positive). For a + // stack that grown down, this works out to be (B - E). [3] + // + // E is also the value of %rsp after stack has been set up, and we + // want (C - E) -- the value we can add to %rsp to get to Obj0. 
Now + // (C - E) == (C - A) - (B - A) + (B - E) + // { Using [1], [2] and [3] above } + // == getObjectOffset - LocalAreaOffset + StackSize + // + + // Get the Offset from the StackPointer + int Offset = MFI->getObjectOffset(FI) - getOffsetOfLocalArea(); + + return Offset + StackSize; +} +// Simplified from getFrameIndexReference keeping only StackPointer cases +int X86FrameLowering::getFrameIndexReferenceFromSP(const MachineFunction &MF, int FI, + unsigned &FrameReg) const { + const X86RegisterInfo *RegInfo = + static_cast<const X86RegisterInfo*>(MF.getSubtarget().getRegisterInfo()); + + assert(!RegInfo->hasBasePointer(MF) && "we don't handle this case"); + + FrameReg = RegInfo->getStackRegister(); + return getFrameIndexOffsetFromSP(MF, FI); +} + +bool X86FrameLowering::assignCalleeSavedSpillSlots( + MachineFunction &MF, const TargetRegisterInfo *TRI, + std::vector<CalleeSavedInfo> &CSI) const { + MachineFrameInfo *MFI = MF.getFrameInfo(); + const X86RegisterInfo *RegInfo = + static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo()); + unsigned SlotSize = RegInfo->getSlotSize(); + X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + + unsigned CalleeSavedFrameSize = 0; + int SpillSlotOffset = getOffsetOfLocalArea() + X86FI->getTCReturnAddrDelta(); + + if (hasFP(MF)) { + // emitPrologue always spills frame register the first thing. + SpillSlotOffset -= SlotSize; + MFI->CreateFixedSpillStackObject(SlotSize, SpillSlotOffset); + + // Since emitPrologue and emitEpilogue will handle spilling and restoring of + // the frame register, we can delete it from CSI list and not have to worry + // about avoiding it later. + unsigned FPReg = RegInfo->getFrameRegister(MF); + for (unsigned i = 0; i < CSI.size(); ++i) { + if (TRI->regsOverlap(CSI[i].getReg(),FPReg)) { + CSI.erase(CSI.begin() + i); + break; + } + } + } + + // Assign slots for GPRs. It increases frame size. + for (unsigned i = CSI.size(); i != 0; --i) { + unsigned Reg = CSI[i - 1].getReg(); + + if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg)) + continue; + + SpillSlotOffset -= SlotSize; + CalleeSavedFrameSize += SlotSize; + + int SlotIndex = MFI->CreateFixedSpillStackObject(SlotSize, SpillSlotOffset); + CSI[i - 1].setFrameIdx(SlotIndex); + } + + X86FI->setCalleeSavedFrameSize(CalleeSavedFrameSize); + + // Assign slots for XMMs. + for (unsigned i = CSI.size(); i != 0; --i) { + unsigned Reg = CSI[i - 1].getReg(); + if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg)) + continue; + + const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg); + // ensure alignment + SpillSlotOffset -= std::abs(SpillSlotOffset) % RC->getAlignment(); + // spill into slot + SpillSlotOffset -= RC->getSize(); + int SlotIndex = + MFI->CreateFixedSpillStackObject(RC->getSize(), SpillSlotOffset); + CSI[i - 1].setFrameIdx(SlotIndex); + MFI->ensureMaxAlignment(RC->getAlignment()); + } + + return true; +} + +bool X86FrameLowering::spillCalleeSavedRegisters( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const { + DebugLoc DL = MBB.findDebugLoc(MI); + + MachineFunction &MF = *MBB.getParent(); + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); + + // Push GPRs. It increases frame size. + unsigned Opc = STI.is64Bit() ? 
X86::PUSH64r : X86::PUSH32r; + for (unsigned i = CSI.size(); i != 0; --i) { + unsigned Reg = CSI[i - 1].getReg(); + + if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg)) + continue; + // Add the callee-saved register as live-in. It's killed at the spill. + MBB.addLiveIn(Reg); + + BuildMI(MBB, MI, DL, TII.get(Opc)).addReg(Reg, RegState::Kill) + .setMIFlag(MachineInstr::FrameSetup); + } + + // Make XMM regs spilled. X86 does not have ability of push/pop XMM. + // It can be done by spilling XMMs to stack frame. + for (unsigned i = CSI.size(); i != 0; --i) { + unsigned Reg = CSI[i-1].getReg(); + if (X86::GR64RegClass.contains(Reg) || + X86::GR32RegClass.contains(Reg)) + continue; + // Add the callee-saved register as live-in. It's killed at the spill. + MBB.addLiveIn(Reg); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + + TII.storeRegToStackSlot(MBB, MI, Reg, true, CSI[i - 1].getFrameIdx(), RC, + TRI); + --MI; + MI->setFlag(MachineInstr::FrameSetup); + ++MI; + } + + return true; +} + +bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const { + if (CSI.empty()) + return false; + + DebugLoc DL = MBB.findDebugLoc(MI); + + MachineFunction &MF = *MBB.getParent(); + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); + + // Reload XMMs from stack frame. + for (unsigned i = 0, e = CSI.size(); i != e; ++i) { + unsigned Reg = CSI[i].getReg(); + if (X86::GR64RegClass.contains(Reg) || + X86::GR32RegClass.contains(Reg)) + continue; + + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + TII.loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(), RC, TRI); + } + + // POP GPRs. + unsigned Opc = STI.is64Bit() ? X86::POP64r : X86::POP32r; + for (unsigned i = 0, e = CSI.size(); i != e; ++i) { + unsigned Reg = CSI[i].getReg(); + if (!X86::GR64RegClass.contains(Reg) && + !X86::GR32RegClass.contains(Reg)) + continue; + + BuildMI(MBB, MI, DL, TII.get(Opc), Reg); + } + return true; +} + +void +X86FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, + RegScavenger *RS) const { + MachineFrameInfo *MFI = MF.getFrameInfo(); + const X86RegisterInfo *RegInfo = + static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo()); + unsigned SlotSize = RegInfo->getSlotSize(); + + X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + int64_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta(); + + if (TailCallReturnAddrDelta < 0) { + // create RETURNADDR area + // arg + // arg + // RETADDR + // { ... + // RETADDR area + // ... + // } + // [EBP] + MFI->CreateFixedObject(-TailCallReturnAddrDelta, + TailCallReturnAddrDelta - SlotSize, true); + } + + // Spill the BasePtr if it's used. + if (RegInfo->hasBasePointer(MF)) + MF.getRegInfo().setPhysRegUsed(RegInfo->getBaseRegister()); +} + +static bool +HasNestArgument(const MachineFunction *MF) { + const Function *F = MF->getFunction(); + for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); + I != E; I++) { + if (I->hasNestAttr()) + return true; + } + return false; +} + +/// GetScratchRegister - Get a temp register for performing work in the +/// segmented stack and the Erlang/HiPE stack prologue. Depending on platform +/// and the properties of the function either one or two registers will be +/// needed. 
Set primary to true for the first register, false for the second. +static unsigned +GetScratchRegister(bool Is64Bit, bool IsLP64, const MachineFunction &MF, bool Primary) { + CallingConv::ID CallingConvention = MF.getFunction()->getCallingConv(); + + // Erlang stuff. + if (CallingConvention == CallingConv::HiPE) { + if (Is64Bit) + return Primary ? X86::R14 : X86::R13; + else + return Primary ? X86::EBX : X86::EDI; + } + + if (Is64Bit) { + if (IsLP64) + return Primary ? X86::R11 : X86::R12; + else + return Primary ? X86::R11D : X86::R12D; + } + + bool IsNested = HasNestArgument(&MF); + + if (CallingConvention == CallingConv::X86_FastCall || + CallingConvention == CallingConv::Fast) { + if (IsNested) + report_fatal_error("Segmented stacks does not support fastcall with " + "nested function."); + return Primary ? X86::EAX : X86::ECX; + } + if (IsNested) + return Primary ? X86::EDX : X86::EAX; + return Primary ? X86::ECX : X86::EAX; +} + +// The stack limit in the TCB is set to this many bytes above the actual stack +// limit. +static const uint64_t kSplitStackAvailable = 256; + +void +X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { + MachineBasicBlock &prologueMBB = MF.front(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + uint64_t StackSize; + const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); + bool Is64Bit = STI.is64Bit(); + const bool IsLP64 = STI.isTarget64BitLP64(); + unsigned TlsReg, TlsOffset; + DebugLoc DL; + + unsigned ScratchReg = GetScratchRegister(Is64Bit, IsLP64, MF, true); + assert(!MF.getRegInfo().isLiveIn(ScratchReg) && + "Scratch register is live-in"); + + if (MF.getFunction()->isVarArg()) + report_fatal_error("Segmented stacks do not support vararg functions."); + if (!STI.isTargetLinux() && !STI.isTargetDarwin() && !STI.isTargetWin32() && + !STI.isTargetWin64() && !STI.isTargetFreeBSD() && + !STI.isTargetDragonFly()) + report_fatal_error("Segmented stacks not supported on this platform."); + + // Eventually StackSize will be calculated by a link-time pass; which will + // also decide whether checking code needs to be injected into this particular + // prologue. + StackSize = MFI->getStackSize(); + + // Do not generate a prologue for functions with a stack of size zero + if (StackSize == 0) + return; + + MachineBasicBlock *allocMBB = MF.CreateMachineBasicBlock(); + MachineBasicBlock *checkMBB = MF.CreateMachineBasicBlock(); + X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + bool IsNested = false; + + // We need to know if the function has a nest argument only in 64 bit mode. + if (Is64Bit) + IsNested = HasNestArgument(&MF); + + // The MOV R10, RAX needs to be in a different block, since the RET we emit in + // allocMBB needs to be last (terminating) instruction. + + for (MachineBasicBlock::livein_iterator i = prologueMBB.livein_begin(), + e = prologueMBB.livein_end(); i != e; i++) { + allocMBB->addLiveIn(*i); + checkMBB->addLiveIn(*i); + } + + if (IsNested) + allocMBB->addLiveIn(IsLP64 ? X86::R10 : X86::R10D); + + MF.push_front(allocMBB); + MF.push_front(checkMBB); + + // When the frame size is less than 256 we just compare the stack + // boundary directly to the value of the stack pointer, per gcc. + bool CompareStackPointer = StackSize < kSplitStackAvailable; + + // Read the limit off the current stacklet off the stack_guard location. + if (Is64Bit) { + if (STI.isTargetLinux()) { + TlsReg = X86::FS; + TlsOffset = IsLP64 ? 
0x70 : 0x40; + } else if (STI.isTargetDarwin()) { + TlsReg = X86::GS; + TlsOffset = 0x60 + 90*8; // See pthread_machdep.h. Steal TLS slot 90. + } else if (STI.isTargetWin64()) { + TlsReg = X86::GS; + TlsOffset = 0x28; // pvArbitrary, reserved for application use + } else if (STI.isTargetFreeBSD()) { + TlsReg = X86::FS; + TlsOffset = 0x18; + } else if (STI.isTargetDragonFly()) { + TlsReg = X86::FS; + TlsOffset = 0x20; // use tls_tcb.tcb_segstack + } else { + report_fatal_error("Segmented stacks not supported on this platform."); + } + + if (CompareStackPointer) + ScratchReg = IsLP64 ? X86::RSP : X86::ESP; + else + BuildMI(checkMBB, DL, TII.get(IsLP64 ? X86::LEA64r : X86::LEA64_32r), ScratchReg).addReg(X86::RSP) + .addImm(1).addReg(0).addImm(-StackSize).addReg(0); + + BuildMI(checkMBB, DL, TII.get(IsLP64 ? X86::CMP64rm : X86::CMP32rm)).addReg(ScratchReg) + .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg); + } else { + if (STI.isTargetLinux()) { + TlsReg = X86::GS; + TlsOffset = 0x30; + } else if (STI.isTargetDarwin()) { + TlsReg = X86::GS; + TlsOffset = 0x48 + 90*4; + } else if (STI.isTargetWin32()) { + TlsReg = X86::FS; + TlsOffset = 0x14; // pvArbitrary, reserved for application use + } else if (STI.isTargetDragonFly()) { + TlsReg = X86::FS; + TlsOffset = 0x10; // use tls_tcb.tcb_segstack + } else if (STI.isTargetFreeBSD()) { + report_fatal_error("Segmented stacks not supported on FreeBSD i386."); + } else { + report_fatal_error("Segmented stacks not supported on this platform."); + } + + if (CompareStackPointer) + ScratchReg = X86::ESP; + else + BuildMI(checkMBB, DL, TII.get(X86::LEA32r), ScratchReg).addReg(X86::ESP) + .addImm(1).addReg(0).addImm(-StackSize).addReg(0); + + if (STI.isTargetLinux() || STI.isTargetWin32() || STI.isTargetWin64() || + STI.isTargetDragonFly()) { + BuildMI(checkMBB, DL, TII.get(X86::CMP32rm)).addReg(ScratchReg) + .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg); + } else if (STI.isTargetDarwin()) { + + // TlsOffset doesn't fit into a mod r/m byte so we need an extra register. + unsigned ScratchReg2; + bool SaveScratch2; + if (CompareStackPointer) { + // The primary scratch register is available for holding the TLS offset. + ScratchReg2 = GetScratchRegister(Is64Bit, IsLP64, MF, true); + SaveScratch2 = false; + } else { + // Need to use a second register to hold the TLS offset + ScratchReg2 = GetScratchRegister(Is64Bit, IsLP64, MF, false); + + // Unfortunately, with fastcc the second scratch register may hold an + // argument. + SaveScratch2 = MF.getRegInfo().isLiveIn(ScratchReg2); + } + + // If Scratch2 is live-in then it needs to be saved. + assert((!MF.getRegInfo().isLiveIn(ScratchReg2) || SaveScratch2) && + "Scratch register is live-in and not saved"); + + if (SaveScratch2) + BuildMI(checkMBB, DL, TII.get(X86::PUSH32r)) + .addReg(ScratchReg2, RegState::Kill); + + BuildMI(checkMBB, DL, TII.get(X86::MOV32ri), ScratchReg2) + .addImm(TlsOffset); + BuildMI(checkMBB, DL, TII.get(X86::CMP32rm)) + .addReg(ScratchReg) + .addReg(ScratchReg2).addImm(1).addReg(0) + .addImm(0) + .addReg(TlsReg); + + if (SaveScratch2) + BuildMI(checkMBB, DL, TII.get(X86::POP32r), ScratchReg2); + } + } + + // This jump is taken if SP >= (Stacklet Limit + Stack Space required). + // It jumps to normal execution of the function body. + BuildMI(checkMBB, DL, TII.get(X86::JA_1)).addMBB(&prologueMBB); + + // On 32 bit we first push the arguments size and then the frame size. On 64 + // bit, we pass the stack frame size in r10 and the argument size in r11. 
+ if (Is64Bit) { + // Functions with nested arguments use R10, so it needs to be saved across + // the call to _morestack + + const unsigned RegAX = IsLP64 ? X86::RAX : X86::EAX; + const unsigned Reg10 = IsLP64 ? X86::R10 : X86::R10D; + const unsigned Reg11 = IsLP64 ? X86::R11 : X86::R11D; + const unsigned MOVrr = IsLP64 ? X86::MOV64rr : X86::MOV32rr; + const unsigned MOVri = IsLP64 ? X86::MOV64ri : X86::MOV32ri; + + if (IsNested) + BuildMI(allocMBB, DL, TII.get(MOVrr), RegAX).addReg(Reg10); + + BuildMI(allocMBB, DL, TII.get(MOVri), Reg10) + .addImm(StackSize); + BuildMI(allocMBB, DL, TII.get(MOVri), Reg11) + .addImm(X86FI->getArgumentStackSize()); + MF.getRegInfo().setPhysRegUsed(Reg10); + MF.getRegInfo().setPhysRegUsed(Reg11); + } else { + BuildMI(allocMBB, DL, TII.get(X86::PUSHi32)) + .addImm(X86FI->getArgumentStackSize()); + BuildMI(allocMBB, DL, TII.get(X86::PUSHi32)) + .addImm(StackSize); + } + + // __morestack is in libgcc + if (Is64Bit && MF.getTarget().getCodeModel() == CodeModel::Large) { + // Under the large code model, we cannot assume that __morestack lives + // within 2^31 bytes of the call site, so we cannot use pc-relative + // addressing. We cannot perform the call via a temporary register, + // as the rax register may be used to store the static chain, and all + // other suitable registers may be either callee-save or used for + // parameter passing. We cannot use the stack at this point either + // because __morestack manipulates the stack directly. + // + // To avoid these issues, perform an indirect call via a read-only memory + // location containing the address. + // + // This solution is not perfect, as it assumes that the .rodata section + // is laid out within 2^31 bytes of each function body, but this seems + // to be sufficient for JIT. + BuildMI(allocMBB, DL, TII.get(X86::CALL64m)) + .addReg(X86::RIP) + .addImm(0) + .addReg(0) + .addExternalSymbol("__morestack_addr") + .addReg(0); + MF.getMMI().setUsesMorestackAddr(true); + } else { + if (Is64Bit) + BuildMI(allocMBB, DL, TII.get(X86::CALL64pcrel32)) + .addExternalSymbol("__morestack"); + else + BuildMI(allocMBB, DL, TII.get(X86::CALLpcrel32)) + .addExternalSymbol("__morestack"); + } + + if (IsNested) + BuildMI(allocMBB, DL, TII.get(X86::MORESTACK_RET_RESTORE_R10)); + else + BuildMI(allocMBB, DL, TII.get(X86::MORESTACK_RET)); + + allocMBB->addSuccessor(&prologueMBB); + + checkMBB->addSuccessor(allocMBB); + checkMBB->addSuccessor(&prologueMBB); + +#ifdef XDEBUG + MF.verify(); +#endif +} + +/// Erlang programs may need a special prologue to handle the stack size they +/// might need at runtime. That is because Erlang/OTP does not implement a C +/// stack but uses a custom implementation of hybrid stack/heap architecture. +/// (for more information see Eric Stenman's Ph.D. thesis: +/// http://publications.uu.se/uu/fulltext/nbn_se_uu_diva-2688.pdf) +/// +/// CheckStack: +/// temp0 = sp - MaxStack +/// if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart +/// OldStart: +/// ... 
+/// IncStack: +/// call inc_stack # doubles the stack space +/// temp0 = sp - MaxStack +/// if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart +void X86FrameLowering::adjustForHiPEPrologue(MachineFunction &MF) const { + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + const unsigned SlotSize = + static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo()) + ->getSlotSize(); + const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); + const bool Is64Bit = STI.is64Bit(); + const bool IsLP64 = STI.isTarget64BitLP64(); + DebugLoc DL; + // HiPE-specific values + const unsigned HipeLeafWords = 24; + const unsigned CCRegisteredArgs = Is64Bit ? 6 : 5; + const unsigned Guaranteed = HipeLeafWords * SlotSize; + unsigned CallerStkArity = MF.getFunction()->arg_size() > CCRegisteredArgs ? + MF.getFunction()->arg_size() - CCRegisteredArgs : 0; + unsigned MaxStack = MFI->getStackSize() + CallerStkArity*SlotSize + SlotSize; + + assert(STI.isTargetLinux() && + "HiPE prologue is only supported on Linux operating systems."); + + // Compute the largest caller's frame that is needed to fit the callees' + // frames. This 'MaxStack' is computed from: + // + // a) the fixed frame size, which is the space needed for all spilled temps, + // b) outgoing on-stack parameter areas, and + // c) the minimum stack space this function needs to make available for the + // functions it calls (a tunable ABI property). + if (MFI->hasCalls()) { + unsigned MoreStackForCalls = 0; + + for (MachineFunction::iterator MBBI = MF.begin(), MBBE = MF.end(); + MBBI != MBBE; ++MBBI) + for (MachineBasicBlock::iterator MI = MBBI->begin(), ME = MBBI->end(); + MI != ME; ++MI) { + if (!MI->isCall()) + continue; + + // Get callee operand. + const MachineOperand &MO = MI->getOperand(0); + + // Only take account of global function calls (no closures etc.). + if (!MO.isGlobal()) + continue; + + const Function *F = dyn_cast<Function>(MO.getGlobal()); + if (!F) + continue; + + // Do not update 'MaxStack' for primitive and built-in functions + // (encoded with names either starting with "erlang."/"bif_" or not + // having a ".", such as a simple <Module>.<Function>.<Arity>, or an + // "_", such as the BIF "suspend_0") as they are executed on another + // stack. + if (F->getName().find("erlang.") != StringRef::npos || + F->getName().find("bif_") != StringRef::npos || + F->getName().find_first_of("._") == StringRef::npos) + continue; + + unsigned CalleeStkArity = + F->arg_size() > CCRegisteredArgs ? F->arg_size()-CCRegisteredArgs : 0; + if (HipeLeafWords - 1 > CalleeStkArity) + MoreStackForCalls = std::max(MoreStackForCalls, + (HipeLeafWords - 1 - CalleeStkArity) * SlotSize); + } + MaxStack += MoreStackForCalls; + } + + // If the stack frame needed is larger than the guaranteed then runtime checks + // and calls to "inc_stack_0" BIF should be inserted in the assembly prologue. 
+ if (MaxStack > Guaranteed) { + MachineBasicBlock &prologueMBB = MF.front(); + MachineBasicBlock *stackCheckMBB = MF.CreateMachineBasicBlock(); + MachineBasicBlock *incStackMBB = MF.CreateMachineBasicBlock(); + + for (MachineBasicBlock::livein_iterator I = prologueMBB.livein_begin(), + E = prologueMBB.livein_end(); I != E; I++) { + stackCheckMBB->addLiveIn(*I); + incStackMBB->addLiveIn(*I); + } + + MF.push_front(incStackMBB); + MF.push_front(stackCheckMBB); + + unsigned ScratchReg, SPReg, PReg, SPLimitOffset; + unsigned LEAop, CMPop, CALLop; + if (Is64Bit) { + SPReg = X86::RSP; + PReg = X86::RBP; + LEAop = X86::LEA64r; + CMPop = X86::CMP64rm; + CALLop = X86::CALL64pcrel32; + SPLimitOffset = 0x90; + } else { + SPReg = X86::ESP; + PReg = X86::EBP; + LEAop = X86::LEA32r; + CMPop = X86::CMP32rm; + CALLop = X86::CALLpcrel32; + SPLimitOffset = 0x4c; + } + + ScratchReg = GetScratchRegister(Is64Bit, IsLP64, MF, true); + assert(!MF.getRegInfo().isLiveIn(ScratchReg) && + "HiPE prologue scratch register is live-in"); + + // Create new MBB for StackCheck: + addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(LEAop), ScratchReg), + SPReg, false, -MaxStack); + // SPLimitOffset is in a fixed heap location (pointed by BP). + addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(CMPop)) + .addReg(ScratchReg), PReg, false, SPLimitOffset); + BuildMI(stackCheckMBB, DL, TII.get(X86::JAE_1)).addMBB(&prologueMBB); + + // Create new MBB for IncStack: + BuildMI(incStackMBB, DL, TII.get(CALLop)). + addExternalSymbol("inc_stack_0"); + addRegOffset(BuildMI(incStackMBB, DL, TII.get(LEAop), ScratchReg), + SPReg, false, -MaxStack); + addRegOffset(BuildMI(incStackMBB, DL, TII.get(CMPop)) + .addReg(ScratchReg), PReg, false, SPLimitOffset); + BuildMI(incStackMBB, DL, TII.get(X86::JLE_1)).addMBB(incStackMBB); + + stackCheckMBB->addSuccessor(&prologueMBB, 99); + stackCheckMBB->addSuccessor(incStackMBB, 1); + incStackMBB->addSuccessor(&prologueMBB, 99); + incStackMBB->addSuccessor(incStackMBB, 1); + } +#ifdef XDEBUG + MF.verify(); +#endif +} + +bool X86FrameLowering:: +convertArgMovsToPushes(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, uint64_t Amount) const { + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + const X86RegisterInfo &RegInfo = *static_cast<const X86RegisterInfo *>( + MF.getSubtarget().getRegisterInfo()); + unsigned StackPtr = RegInfo.getStackRegister(); + + // Scan the call setup sequence for the pattern we're looking for. + // We only handle a simple case now - a sequence of MOV32mi or MOV32mr + // instructions, that push a sequence of 32-bit values onto the stack, with + // no gaps. + std::map<int64_t, MachineBasicBlock::iterator> MovMap; + do { + int Opcode = I->getOpcode(); + if (Opcode != X86::MOV32mi && Opcode != X86::MOV32mr) + break; + + // We only want movs of the form: + // movl imm/r32, k(%ecx) + // If we run into something else, bail + // Note that AddrBaseReg may, counterintuitively, not be a register... + if (!I->getOperand(X86::AddrBaseReg).isReg() || + (I->getOperand(X86::AddrBaseReg).getReg() != StackPtr) || + !I->getOperand(X86::AddrScaleAmt).isImm() || + (I->getOperand(X86::AddrScaleAmt).getImm() != 1) || + (I->getOperand(X86::AddrIndexReg).getReg() != X86::NoRegister) || + (I->getOperand(X86::AddrSegmentReg).getReg() != X86::NoRegister) || + !I->getOperand(X86::AddrDisp).isImm()) + return false; + + int64_t StackDisp = I->getOperand(X86::AddrDisp).getImm(); + + // We don't want to consider the unaligned case. 
+ if (StackDisp % 4) + return false; + + // If the same stack slot is being filled twice, something's fishy. + if (!MovMap.insert(std::pair<int64_t, MachineInstr*>(StackDisp, I)).second) + return false; + + ++I; + } while (I != MBB.end()); + + // We now expect the end of the sequence - a call and a stack adjust. + if (I == MBB.end()) + return false; + if (!I->isCall()) + return false; + MachineBasicBlock::iterator Call = I; + if ((++I)->getOpcode() != TII.getCallFrameDestroyOpcode()) + return false; + + // Now, go through the map, and see that we don't have any gaps, + // but only a series of 32-bit MOVs. + // Since std::map provides ordered iteration, the original order + // of the MOVs doesn't matter. + int64_t ExpectedDist = 0; + for (auto MMI = MovMap.begin(), MME = MovMap.end(); MMI != MME; + ++MMI, ExpectedDist += 4) + if (MMI->first != ExpectedDist) + return false; + + // Ok, everything looks fine. Do the transformation. + DebugLoc DL = I->getDebugLoc(); + + // It's possible the original stack adjustment amount was larger than + // that done by the pushes. If so, we still need a SUB. + Amount -= ExpectedDist; + if (Amount) { + MachineInstr* Sub = BuildMI(MBB, Call, DL, + TII.get(getSUBriOpcode(false, Amount)), StackPtr) + .addReg(StackPtr).addImm(Amount); + Sub->getOperand(3).setIsDead(); + } + + // Now, iterate through the map in reverse order, and replace the movs + // with pushes. MOVmi/MOVmr doesn't have any defs, so need to replace uses. + for (auto MMI = MovMap.rbegin(), MME = MovMap.rend(); MMI != MME; ++MMI) { + MachineBasicBlock::iterator MOV = MMI->second; + MachineOperand PushOp = MOV->getOperand(X86::AddrNumOperands); + + // Replace MOVmr with PUSH32r, and MOVmi with PUSHi of appropriate size + int PushOpcode = X86::PUSH32r; + if (MOV->getOpcode() == X86::MOV32mi) + PushOpcode = getPUSHiOpcode(false, PushOp); + + BuildMI(MBB, Call, DL, TII.get(PushOpcode)).addOperand(PushOp); + MBB.erase(MOV); + } + + return true; +} + +void X86FrameLowering:: +eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const { + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + const X86RegisterInfo &RegInfo = *static_cast<const X86RegisterInfo *>( + MF.getSubtarget().getRegisterInfo()); + unsigned StackPtr = RegInfo.getStackRegister(); + bool reserveCallFrame = hasReservedCallFrame(MF); + int Opcode = I->getOpcode(); + bool isDestroy = Opcode == TII.getCallFrameDestroyOpcode(); + const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); + bool IsLP64 = STI.isTarget64BitLP64(); + DebugLoc DL = I->getDebugLoc(); + uint64_t Amount = !reserveCallFrame ? I->getOperand(0).getImm() : 0; + uint64_t CalleeAmt = isDestroy ? I->getOperand(1).getImm() : 0; + I = MBB.erase(I); + + if (!reserveCallFrame) { + // If the stack pointer can be changed after prologue, turn the + // adjcallstackup instruction into a 'sub ESP, <amt>' and the + // adjcallstackdown instruction into 'add ESP, <amt>' + if (Amount == 0) + return; + + // We need to keep the stack aligned properly. To do this, we round the + // amount of space needed for the outgoing arguments up to the next + // alignment boundary. + unsigned StackAlign = MF.getTarget() + .getSubtargetImpl() + ->getFrameLowering() + ->getStackAlignment(); + Amount = (Amount + StackAlign - 1) / StackAlign * StackAlign; + + MachineInstr *New = nullptr; + if (Opcode == TII.getCallFrameSetupOpcode()) { + // Try to convert movs to the stack into pushes. 
+ // We currently only look for a pattern that appears in 32-bit + // calling conventions. + if (!IsLP64 && convertArgMovsToPushes(MF, MBB, I, Amount)) + return; + + New = BuildMI(MF, DL, TII.get(getSUBriOpcode(IsLP64, Amount)), + StackPtr) + .addReg(StackPtr) + .addImm(Amount); + } else { + assert(Opcode == TII.getCallFrameDestroyOpcode()); + + // Factor out the amount the callee already popped. + Amount -= CalleeAmt; + + if (Amount) { + unsigned Opc = getADDriOpcode(IsLP64, Amount); + New = BuildMI(MF, DL, TII.get(Opc), StackPtr) + .addReg(StackPtr).addImm(Amount); + } + } + + if (New) { + // The EFLAGS implicit def is dead. + New->getOperand(3).setIsDead(); + + // Replace the pseudo instruction with a new instruction. + MBB.insert(I, New); + } + + return; + } + + if (Opcode == TII.getCallFrameDestroyOpcode() && CalleeAmt) { + // If we are performing frame pointer elimination and if the callee pops + // something off the stack pointer, add it back. We do this until we have + // more advanced stack pointer tracking ability. + unsigned Opc = getSUBriOpcode(IsLP64, CalleeAmt); + MachineInstr *New = BuildMI(MF, DL, TII.get(Opc), StackPtr) + .addReg(StackPtr).addImm(CalleeAmt); + + // The EFLAGS implicit def is dead. + New->getOperand(3).setIsDead(); + + // We are not tracking the stack pointer adjustment by the callee, so make + // sure we restore the stack pointer immediately after the call, there may + // be spill code inserted between the CALL and ADJCALLSTACKUP instructions. + MachineBasicBlock::iterator B = MBB.begin(); + while (I != B && !std::prev(I)->isCall()) + --I; + MBB.insert(I, New); + } +} + diff --git a/llvm/lib/Target/X86/X86FrameLowering.h b/llvm/lib/Target/X86/X86FrameLowering.h index d93c9467182..dd8fc3240c2 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.h +++ b/llvm/lib/Target/X86/X86FrameLowering.h @@ -1,97 +1,95 @@ -//===-- X86TargetFrameLowering.h - Define frame lowering for X86 -*- C++ -*-==//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This class implements X86-specific bits of TargetFrameLowering class.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_X86_X86FRAMELOWERING_H
-#define LLVM_LIB_TARGET_X86_X86FRAMELOWERING_H
-
-#include "llvm/Target/TargetFrameLowering.h"
-
-namespace llvm {
-
-class MCSymbol;
-class X86TargetMachine;
-class X86Subtarget;
-
-class X86FrameLowering : public TargetFrameLowering {
-public:
- explicit X86FrameLowering(StackDirection D, unsigned StackAl, int LAO)
- : TargetFrameLowering(StackGrowsDown, StackAl, LAO) {}
-
- /// Emit a call to the target's stack probe function. This is required for all
- /// large stack allocations on Windows. The caller is required to materialize
- /// the number of bytes to probe in RAX/EAX.
- static void emitStackProbeCall(MachineFunction &MF, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI, DebugLoc DL);
-
- void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- DebugLoc DL) const;
-
- /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
- /// the function.
- void emitPrologue(MachineFunction &MF) const override;
- void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
-
- void adjustForSegmentedStacks(MachineFunction &MF) const override;
-
- void adjustForHiPEPrologue(MachineFunction &MF) const override;
-
- void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
- RegScavenger *RS = nullptr) const override;
-
- bool
- assignCalleeSavedSpillSlots(MachineFunction &MF,
- const TargetRegisterInfo *TRI,
- std::vector<CalleeSavedInfo> &CSI) const override;
-
- bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const override;
-
- bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const override;
-
- bool hasFP(const MachineFunction &MF) const override;
- bool hasReservedCallFrame(const MachineFunction &MF) const override;
- bool canSimplifyCallFramePseudos(const MachineFunction &MF) const override;
- bool needsFrameIndexResolution(const MachineFunction &MF) const override;
-
- int getFrameIndexOffset(const MachineFunction &MF, int FI) const override;
- int getFrameIndexReference(const MachineFunction &MF, int FI,
- unsigned &FrameReg) const override;
-
- int getFrameIndexOffsetFromSP(const MachineFunction &MF, int FI) const;
- int getFrameIndexReferenceFromSP(const MachineFunction &MF, int FI,
- unsigned &FrameReg) const override;
-
- void eliminateCallFramePseudoInstr(MachineFunction &MF,
- MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI) const override;
-
-private:
- /// convertArgMovsToPushes - This method tries to convert a call sequence
- /// that uses sub and mov instructions to put the argument onto the stack
- /// into a series of pushes.
- /// Returns true if the transformation succeeded, false if not.
- bool convertArgMovsToPushes(MachineFunction &MF,
- MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I,
- uint64_t Amount) const;
-};
-
-} // End llvm namespace
-
-#endif
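
The header removed above (and re-added with corrected line endings below) declares getFrameIndexOffsetFromSP; its implementation in the X86FrameLowering.cpp hunk earlier in this patch spells the arithmetic out as offset-from-SP == getObjectOffset(FI) - LocalAreaOffset + StackSize. A small standalone sketch of that formula follows; the concrete frame numbers are made up for illustration and this block is editorial, not part of the patch.

  #include <cassert>
  #include <cstdint>

  // Offset of a stack object from the post-prologue stack pointer, following
  // the comment in getFrameIndexOffsetFromSP in this patch:
  //   offset-from-SP == getObjectOffset(FI) - LocalAreaOffset + StackSize
  static int64_t frameIndexOffsetFromSP(int64_t ObjectOffset,
                                        int64_t LocalAreaOffset,
                                        int64_t StackSize) {
    return ObjectOffset - LocalAreaOffset + StackSize;
  }

  int main() {
    // Hypothetical numbers: a 32-byte frame on x86-64, where the local area
    // offset is -8 (the saved return address slot) and the object of interest
    // sits at MFI offset -24.
    assert(frameIndexOffsetFromSP(-24, -8, 32) == 16);
    return 0;
  }
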
+//===-- X86TargetFrameLowering.h - Define frame lowering for X86 -*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This class implements X86-specific bits of TargetFrameLowering class. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_X86FRAMELOWERING_H +#define LLVM_LIB_TARGET_X86_X86FRAMELOWERING_H + +#include "llvm/Target/TargetFrameLowering.h" + +namespace llvm { + +class MCSymbol; +class X86TargetMachine; +class X86Subtarget; + +class X86FrameLowering : public TargetFrameLowering { +public: + explicit X86FrameLowering(StackDirection D, unsigned StackAl, int LAO) + : TargetFrameLowering(StackGrowsDown, StackAl, LAO) {} + + /// Emit a call to the target's stack probe function. This is required for all + /// large stack allocations on Windows. The caller is required to materialize + /// the number of bytes to probe in RAX/EAX. + static void emitStackProbeCall(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL); + + void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL) const; + + /// emitProlog/emitEpilog - These methods insert prolog and epilog code into + /// the function. + void emitPrologue(MachineFunction &MF) const override; + void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; + + void adjustForSegmentedStacks(MachineFunction &MF) const override; + + void adjustForHiPEPrologue(MachineFunction &MF) const override; + + void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, + RegScavenger *RS = nullptr) const override; + + bool + assignCalleeSavedSpillSlots(MachineFunction &MF, + const TargetRegisterInfo *TRI, + std::vector<CalleeSavedInfo> &CSI) const override; + + bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const override; + + bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const override; + + bool hasFP(const MachineFunction &MF) const override; + bool hasReservedCallFrame(const MachineFunction &MF) const override; + + int getFrameIndexOffset(const MachineFunction &MF, int FI) const override; + int getFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg) const override; + + int getFrameIndexOffsetFromSP(const MachineFunction &MF, int FI) const; + int getFrameIndexReferenceFromSP(const MachineFunction &MF, int FI, + unsigned &FrameReg) const override; + + void eliminateCallFramePseudoInstr(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI) const override; + +private: + /// convertArgMovsToPushes - This method tries to convert a call sequence + /// that uses sub and mov instructions to put the argument onto the stack + /// into a series of pushes. + /// Returns true if the transformation succeeded, false if not. 
+ bool convertArgMovsToPushes(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + uint64_t Amount) const; +}; + +} // End llvm namespace + +#endif diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td index fa1dfa7a524..be0e4b790a2 100644 --- a/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/llvm/lib/Target/X86/X86InstrCompiler.td @@ -1,1852 +1,1848 @@ -//===- X86InstrCompiler.td - Compiler Pseudos and Patterns -*- tablegen -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file describes the various pseudo instructions used by the compiler,
-// as well as Pat patterns used during instruction selection.
-//
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// Pattern Matching Support
-
-def GetLo32XForm : SDNodeXForm<imm, [{
- // Transformation function: get the low 32 bits.
- return getI32Imm((unsigned)N->getZExtValue());
-}]>;
-
-def GetLo8XForm : SDNodeXForm<imm, [{
- // Transformation function: get the low 8 bits.
- return getI8Imm((uint8_t)N->getZExtValue());
-}]>;
-
-
-//===----------------------------------------------------------------------===//
-// Random Pseudo Instructions.
-
-// PIC base construction. This expands to code that looks like this:
-// call $next_inst
-// popl %destreg"
-let hasSideEffects = 0, isNotDuplicable = 1, Uses = [ESP] in
- def MOVPC32r : Ii32<0xE8, Pseudo, (outs GR32:$reg), (ins i32imm:$label),
- "", []>;
-
-
-// ADJCALLSTACKDOWN/UP implicitly use/def ESP because they may be expanded into
-// a stack adjustment and the codegen must know that they may modify the stack
-// pointer before prolog-epilog rewriting occurs.
-// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
-// sub / add which can clobber EFLAGS.
-let Defs = [ESP, EFLAGS], Uses = [ESP] in {
-def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
- "#ADJCALLSTACKDOWN",
- []>,
- Requires<[NotLP64]>;
-def ADJCALLSTACKUP32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
- "#ADJCALLSTACKUP",
- [(X86callseq_end timm:$amt1, timm:$amt2)]>,
- Requires<[NotLP64]>;
-}
-def : Pat<(X86callseq_start timm:$amt1),
- (ADJCALLSTACKDOWN32 i32imm:$amt1, 0)>, Requires<[NotLP64]>;
-
-
-// ADJCALLSTACKDOWN/UP implicitly use/def RSP because they may be expanded into
-// a stack adjustment and the codegen must know that they may modify the stack
-// pointer before prolog-epilog rewriting occurs.
-// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
-// sub / add which can clobber EFLAGS.
-let Defs = [RSP, EFLAGS], Uses = [RSP] in {
-def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
- "#ADJCALLSTACKDOWN",
- []>,
- Requires<[IsLP64]>;
-def ADJCALLSTACKUP64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
- "#ADJCALLSTACKUP",
- [(X86callseq_end timm:$amt1, timm:$amt2)]>,
- Requires<[IsLP64]>;
-}
-def : Pat<(X86callseq_start timm:$amt1),
- (ADJCALLSTACKDOWN64 i32imm:$amt1, 0)>, Requires<[IsLP64]>;
-
-
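
The ADJCALLSTACKDOWN/UP pseudos above are what eliminateCallFramePseudoInstr, in the X86FrameLowering.cpp hunk of this patch, rewrites into explicit stack-pointer adjustments; before doing so it rounds the adjustment up to the stack alignment. A minimal standalone sketch of that rounding, where the 16-byte alignment is an assumed value for illustration and the block is editorial rather than patch content:

  #include <cassert>
  #include <cstdint>

  // Round a call-frame adjustment up to the stack alignment, mirroring the
  // expression used by eliminateCallFramePseudoInstr in this patch:
  //   Amount = (Amount + StackAlign - 1) / StackAlign * StackAlign;
  static uint64_t roundUpToStackAlign(uint64_t Amount, uint64_t StackAlign) {
    return (Amount + StackAlign - 1) / StackAlign * StackAlign;
  }

  int main() {
    assert(roundUpToStackAlign(20, 16) == 32);
    assert(roundUpToStackAlign(32, 16) == 32);
    assert(roundUpToStackAlign(0, 16) == 0);
    return 0;
  }
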
-// x86-64 va_start lowering magic.
-let usesCustomInserter = 1, Defs = [EFLAGS] in {
-def VASTART_SAVE_XMM_REGS : I<0, Pseudo,
- (outs),
- (ins GR8:$al,
- i64imm:$regsavefi, i64imm:$offset,
- variable_ops),
- "#VASTART_SAVE_XMM_REGS $al, $regsavefi, $offset",
- [(X86vastart_save_xmm_regs GR8:$al,
- imm:$regsavefi,
- imm:$offset),
- (implicit EFLAGS)]>;
-
-// The VAARG_64 pseudo-instruction takes the address of the va_list,
-// and places the address of the next argument into a register.
-let Defs = [EFLAGS] in
-def VAARG_64 : I<0, Pseudo,
- (outs GR64:$dst),
- (ins i8mem:$ap, i32imm:$size, i8imm:$mode, i32imm:$align),
- "#VAARG_64 $dst, $ap, $size, $mode, $align",
- [(set GR64:$dst,
- (X86vaarg64 addr:$ap, imm:$size, imm:$mode, imm:$align)),
- (implicit EFLAGS)]>;
-
-// Dynamic stack allocation yields a _chkstk or _alloca call for all Windows
-// targets. These calls are needed to probe the stack when allocating more than
-// 4k bytes in one go. Touching the stack at 4K increments is necessary to
-// ensure that the guard pages used by the OS virtual memory manager are
-// allocated in correct sequence.
-// The main point of having separate instruction are extra unmodelled effects
-// (compared to ordinary calls) like stack pointer change.
-
-let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in
- def WIN_ALLOCA : I<0, Pseudo, (outs), (ins),
- "# dynamic stack allocation",
- [(X86WinAlloca)]>;
-
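
WIN_ALLOCA above exists because, as its comment notes, Windows-style dynamic allocations of more than 4K bytes must touch the stack one page at a time so the OS commits guard pages in sequence. A rough standalone illustration of that probing idea; the page size, the manual touch loop, and the buffer are assumptions of this sketch and not code from the patch:

  #include <cstdint>

  // Touch one byte in every 4 KiB page being committed, walking downward,
  // before the final stack-pointer adjustment (conceptually what the stack
  // probe routine reached through WIN_ALLOCA does).
  static void probeStack(volatile char *sp, uint64_t NumBytes) {
    const uint64_t PageSize = 4096; // assumed page size
    for (uint64_t Probed = PageSize; Probed < NumBytes; Probed += PageSize)
      sp[-(int64_t)Probed] = 0;
    sp[-(int64_t)NumBytes] = 0;
  }

  int main() {
    char buf[1 << 16];
    probeStack(buf + sizeof(buf), 3 * 4096 + 100);
    return 0;
  }
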
-// When using segmented stacks these are lowered into instructions which first
-// check if the current stacklet has enough free memory. If it does, memory is
-// allocated by bumping the stack pointer. Otherwise memory is allocated from
-// the heap.
-
-let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in
-def SEG_ALLOCA_32 : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$size),
- "# variable sized alloca for segmented stacks",
- [(set GR32:$dst,
- (X86SegAlloca GR32:$size))]>,
- Requires<[NotLP64]>;
-
-let Defs = [RAX, RSP, EFLAGS], Uses = [RSP] in
-def SEG_ALLOCA_64 : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$size),
- "# variable sized alloca for segmented stacks",
- [(set GR64:$dst,
- (X86SegAlloca GR64:$size))]>,
- Requires<[In64BitMode]>;
-}
-
-// The MSVC runtime contains an _ftol2 routine for converting floating-point
-// to integer values. It has a strange calling convention: the input is
-// popped from the x87 stack, and the return value is given in EDX:EAX. ECX is
-// used as a temporary register. No other registers (aside from flags) are
-// touched.
-// Microsoft toolchains do not support 80-bit precision, so a WIN_FTOL_80
-// variant is unnecessary.
-
-let Defs = [EAX, EDX, ECX, EFLAGS], FPForm = SpecialFP in {
- def WIN_FTOL_32 : I<0, Pseudo, (outs), (ins RFP32:$src),
- "# win32 fptoui",
- [(X86WinFTOL RFP32:$src)]>,
- Requires<[Not64BitMode]>;
-
- def WIN_FTOL_64 : I<0, Pseudo, (outs), (ins RFP64:$src),
- "# win32 fptoui",
- [(X86WinFTOL RFP64:$src)]>,
- Requires<[Not64BitMode]>;
-}
-
-//===----------------------------------------------------------------------===//
-// EH Pseudo Instructions
-//
-let SchedRW = [WriteSystem] in {
-let isTerminator = 1, isReturn = 1, isBarrier = 1,
- hasCtrlDep = 1, isCodeGenOnly = 1 in {
-def EH_RETURN : I<0xC3, RawFrm, (outs), (ins GR32:$addr),
- "ret\t#eh_return, addr: $addr",
- [(X86ehret GR32:$addr)], IIC_RET>, Sched<[WriteJumpLd]>;
-
-}
-
-let isTerminator = 1, isReturn = 1, isBarrier = 1,
- hasCtrlDep = 1, isCodeGenOnly = 1 in {
-def EH_RETURN64 : I<0xC3, RawFrm, (outs), (ins GR64:$addr),
- "ret\t#eh_return, addr: $addr",
- [(X86ehret GR64:$addr)], IIC_RET>, Sched<[WriteJumpLd]>;
-
-}
-
-let hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1,
- usesCustomInserter = 1 in {
- def EH_SjLj_SetJmp32 : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$buf),
- "#EH_SJLJ_SETJMP32",
- [(set GR32:$dst, (X86eh_sjlj_setjmp addr:$buf))]>,
- Requires<[Not64BitMode]>;
- def EH_SjLj_SetJmp64 : I<0, Pseudo, (outs GR32:$dst), (ins i64mem:$buf),
- "#EH_SJLJ_SETJMP64",
- [(set GR32:$dst, (X86eh_sjlj_setjmp addr:$buf))]>,
- Requires<[In64BitMode]>;
- let isTerminator = 1 in {
- def EH_SjLj_LongJmp32 : I<0, Pseudo, (outs), (ins i32mem:$buf),
- "#EH_SJLJ_LONGJMP32",
- [(X86eh_sjlj_longjmp addr:$buf)]>,
- Requires<[Not64BitMode]>;
- def EH_SjLj_LongJmp64 : I<0, Pseudo, (outs), (ins i64mem:$buf),
- "#EH_SJLJ_LONGJMP64",
- [(X86eh_sjlj_longjmp addr:$buf)]>,
- Requires<[In64BitMode]>;
- }
-}
-} // SchedRW
-
-let isBranch = 1, isTerminator = 1, isCodeGenOnly = 1 in {
- def EH_SjLj_Setup : I<0, Pseudo, (outs), (ins brtarget:$dst),
- "#EH_SjLj_Setup\t$dst", []>;
-}
-
-//===----------------------------------------------------------------------===//
-// Pseudo instructions used by unwind info.
-//
-let isPseudo = 1 in {
- def SEH_PushReg : I<0, Pseudo, (outs), (ins i32imm:$reg),
- "#SEH_PushReg $reg", []>;
- def SEH_SaveReg : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$dst),
- "#SEH_SaveReg $reg, $dst", []>;
- def SEH_SaveXMM : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$dst),
- "#SEH_SaveXMM $reg, $dst", []>;
- def SEH_StackAlloc : I<0, Pseudo, (outs), (ins i32imm:$size),
- "#SEH_StackAlloc $size", []>;
- def SEH_SetFrame : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$offset),
- "#SEH_SetFrame $reg, $offset", []>;
- def SEH_PushFrame : I<0, Pseudo, (outs), (ins i1imm:$mode),
- "#SEH_PushFrame $mode", []>;
- def SEH_EndPrologue : I<0, Pseudo, (outs), (ins),
- "#SEH_EndPrologue", []>;
- def SEH_Epilogue : I<0, Pseudo, (outs), (ins),
- "#SEH_Epilogue", []>;
-}
-
-//===----------------------------------------------------------------------===//
-// Pseudo instructions used by segmented stacks.
-//
-
-// This is lowered into a RET instruction by MCInstLower. We need
-// this so that we don't have to have a MachineBasicBlock which ends
-// with a RET and also has successors.
-let isPseudo = 1 in {
-def MORESTACK_RET: I<0, Pseudo, (outs), (ins),
- "", []>;
-
-// This instruction is lowered to a RET followed by a MOV. The two
-// instructions are not generated on a higher level since then the
-// verifier sees a MachineBasicBlock ending with a non-terminator.
-def MORESTACK_RET_RESTORE_R10 : I<0, Pseudo, (outs), (ins),
- "", []>;
-}
-
-//===----------------------------------------------------------------------===//
-// Alias Instructions
-//===----------------------------------------------------------------------===//
-
-// Alias instruction mapping movr0 to xor.
-// FIXME: remove when we can teach regalloc that xor reg, reg is ok.
-let Defs = [EFLAGS], isReMaterializable = 1, isAsCheapAsAMove = 1,
- isPseudo = 1 in
-def MOV32r0 : I<0, Pseudo, (outs GR32:$dst), (ins), "",
- [(set GR32:$dst, 0)], IIC_ALU_NONMEM>, Sched<[WriteZero]>;
-
-// Other widths can also make use of the 32-bit xor, which may have a smaller
-// encoding and avoid partial register updates.
-def : Pat<(i8 0), (EXTRACT_SUBREG (MOV32r0), sub_8bit)>;
-def : Pat<(i16 0), (EXTRACT_SUBREG (MOV32r0), sub_16bit)>;
-def : Pat<(i64 0), (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit)> {
- let AddedComplexity = 20;
-}
-
-// Materialize an i64 constant whose top 32 bits are zero. This could
-// theoretically use MOV32ri with a SUBREG_TO_REG to represent the
-// zero-extension; however, that would make it more difficult to rematerialize.
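-// For example, "movl $0x12345678, %eax" already clears bits 63:32 of %rax, so
-// the 5-byte 32-bit move is enough here, where a full
-// "movabsq $0x12345678, %rax" would take 10 bytes.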
-let AddedComplexity = 1, isReMaterializable = 1, isAsCheapAsAMove = 1,
- isCodeGenOnly = 1, hasSideEffects = 0 in
-def MOV32ri64 : Ii32<0xb8, AddRegFrm, (outs GR32:$dst), (ins i64i32imm:$src),
- "", [], IIC_ALU_NONMEM>, Sched<[WriteALU]>;
-
-// This 64-bit pseudo-move can be used for both a 64-bit constant that is
-// actually the zero-extension of a 32-bit constant, and for labels in the
-// x86-64 small code model.
-def mov64imm32 : ComplexPattern<i64, 1, "SelectMOV64Imm32", [imm, X86Wrapper]>;
-
-let AddedComplexity = 1 in
-def : Pat<(i64 mov64imm32:$src),
- (SUBREG_TO_REG (i64 0), (MOV32ri64 mov64imm32:$src), sub_32bit)>;
-
-// Use sbb to materialize carry bit.
-let Uses = [EFLAGS], Defs = [EFLAGS], isPseudo = 1, SchedRW = [WriteALU] in {
-// FIXME: These are pseudo ops that should be replaced with Pat<> patterns.
-// However, Pat<> can't replicate the destination reg into the inputs of the
-// result.
-def SETB_C8r : I<0, Pseudo, (outs GR8:$dst), (ins), "",
- [(set GR8:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
-def SETB_C16r : I<0, Pseudo, (outs GR16:$dst), (ins), "",
- [(set GR16:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
-def SETB_C32r : I<0, Pseudo, (outs GR32:$dst), (ins), "",
- [(set GR32:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
-def SETB_C64r : I<0, Pseudo, (outs GR64:$dst), (ins), "",
- [(set GR64:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
-} // Uses = [EFLAGS], Defs = [EFLAGS], isPseudo = 1, SchedRW = [WriteALU]
-
-
-def : Pat<(i16 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
- (SETB_C16r)>;
-def : Pat<(i32 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
- (SETB_C32r)>;
-def : Pat<(i64 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
- (SETB_C64r)>;
-
-def : Pat<(i16 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
- (SETB_C16r)>;
-def : Pat<(i32 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
- (SETB_C32r)>;
-def : Pat<(i64 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
- (SETB_C64r)>;
-
-// We canonicalize 'setb' to "(and (sbb reg,reg), 1)" in the hope that the and
-// will be eliminated and that the sbb can be extended up to a wider type. When
-// this happens, it is great. However, if we are left with an 8-bit sbb and an
-// and, we might as well just match it as a setb.
-def : Pat<(and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1),
- (SETBr)>;
-
-// (add OP, SETB) -> (adc OP, 0)
-def : Pat<(add (and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR8:$op),
- (ADC8ri GR8:$op, 0)>;
-def : Pat<(add (and (i32 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR32:$op),
- (ADC32ri8 GR32:$op, 0)>;
-def : Pat<(add (and (i64 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR64:$op),
- (ADC64ri8 GR64:$op, 0)>;
-
-// (sub OP, SETB) -> (sbb OP, 0)
-def : Pat<(sub GR8:$op, (and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1)),
- (SBB8ri GR8:$op, 0)>;
-def : Pat<(sub GR32:$op, (and (i32 (X86setcc_c X86_COND_B, EFLAGS)), 1)),
- (SBB32ri8 GR32:$op, 0)>;
-def : Pat<(sub GR64:$op, (and (i64 (X86setcc_c X86_COND_B, EFLAGS)), 1)),
- (SBB64ri8 GR64:$op, 0)>;
-
-// (sub OP, SETCC_CARRY) -> (adc OP, 0)
-def : Pat<(sub GR8:$op, (i8 (X86setcc_c X86_COND_B, EFLAGS))),
- (ADC8ri GR8:$op, 0)>;
-def : Pat<(sub GR32:$op, (i32 (X86setcc_c X86_COND_B, EFLAGS))),
- (ADC32ri8 GR32:$op, 0)>;
-def : Pat<(sub GR64:$op, (i64 (X86setcc_c X86_COND_B, EFLAGS))),
- (ADC64ri8 GR64:$op, 0)>;
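-// A C-level sketch of the kind of code the carry folds above target:
-//   unsigned addcarry(unsigned x, unsigned a, unsigned b) { return x + (a < b); }
-// is expected to select to something like
-//   cmpl %edx, %esi   # CF set when a < b
-//   adcl $0, %edi     # x += carry
-//   movl %edi, %eax
-// rather than materializing the compare result in a register of its own.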
-
-//===----------------------------------------------------------------------===//
-// String Pseudo Instructions
-//
-let SchedRW = [WriteMicrocoded] in {
-let Defs = [ECX,EDI,ESI], Uses = [ECX,EDI,ESI], isCodeGenOnly = 1 in {
-def REP_MOVSB_32 : I<0xA4, RawFrm, (outs), (ins), "{rep;movsb|rep movsb}",
- [(X86rep_movs i8)], IIC_REP_MOVS>, REP,
- Requires<[Not64BitMode]>;
-def REP_MOVSW_32 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsw|rep movsw}",
- [(X86rep_movs i16)], IIC_REP_MOVS>, REP, OpSize16,
- Requires<[Not64BitMode]>;
-def REP_MOVSD_32 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsl|rep movsd}",
- [(X86rep_movs i32)], IIC_REP_MOVS>, REP, OpSize32,
- Requires<[Not64BitMode]>;
-}
-
-let Defs = [RCX,RDI,RSI], Uses = [RCX,RDI,RSI], isCodeGenOnly = 1 in {
-def REP_MOVSB_64 : I<0xA4, RawFrm, (outs), (ins), "{rep;movsb|rep movsb}",
- [(X86rep_movs i8)], IIC_REP_MOVS>, REP,
- Requires<[In64BitMode]>;
-def REP_MOVSW_64 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsw|rep movsw}",
- [(X86rep_movs i16)], IIC_REP_MOVS>, REP, OpSize16,
- Requires<[In64BitMode]>;
-def REP_MOVSD_64 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsl|rep movsd}",
- [(X86rep_movs i32)], IIC_REP_MOVS>, REP, OpSize32,
- Requires<[In64BitMode]>;
-def REP_MOVSQ_64 : RI<0xA5, RawFrm, (outs), (ins), "{rep;movsq|rep movsq}",
- [(X86rep_movs i64)], IIC_REP_MOVS>, REP,
- Requires<[In64BitMode]>;
-}
-
-// FIXME: Should use "(X86rep_stos AL)" as the pattern.
-let Defs = [ECX,EDI], isCodeGenOnly = 1 in {
- let Uses = [AL,ECX,EDI] in
- def REP_STOSB_32 : I<0xAA, RawFrm, (outs), (ins), "{rep;stosb|rep stosb}",
- [(X86rep_stos i8)], IIC_REP_STOS>, REP,
- Requires<[Not64BitMode]>;
- let Uses = [AX,ECX,EDI] in
- def REP_STOSW_32 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosw|rep stosw}",
- [(X86rep_stos i16)], IIC_REP_STOS>, REP, OpSize16,
- Requires<[Not64BitMode]>;
- let Uses = [EAX,ECX,EDI] in
- def REP_STOSD_32 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosl|rep stosd}",
- [(X86rep_stos i32)], IIC_REP_STOS>, REP, OpSize32,
- Requires<[Not64BitMode]>;
-}
-
-let Defs = [RCX,RDI], isCodeGenOnly = 1 in {
- let Uses = [AL,RCX,RDI] in
- def REP_STOSB_64 : I<0xAA, RawFrm, (outs), (ins), "{rep;stosb|rep stosb}",
- [(X86rep_stos i8)], IIC_REP_STOS>, REP,
- Requires<[In64BitMode]>;
- let Uses = [AX,RCX,RDI] in
- def REP_STOSW_64 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosw|rep stosw}",
- [(X86rep_stos i16)], IIC_REP_STOS>, REP, OpSize16,
- Requires<[In64BitMode]>;
- let Uses = [RAX,RCX,RDI] in
- def REP_STOSD_64 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosl|rep stosd}",
- [(X86rep_stos i32)], IIC_REP_STOS>, REP, OpSize32,
- Requires<[In64BitMode]>;
-
- let Uses = [RAX,RCX,RDI] in
- def REP_STOSQ_64 : RI<0xAB, RawFrm, (outs), (ins), "{rep;stosq|rep stosq}",
- [(X86rep_stos i64)], IIC_REP_STOS>, REP,
- Requires<[In64BitMode]>;
-}
-} // SchedRW
-
-//===----------------------------------------------------------------------===//
-// Thread Local Storage Instructions
-//
-
-// ELF TLS Support
-// All calls clobber the non-callee saved registers. ESP is marked as
-// a use to prevent stack-pointer assignments that appear immediately
-// before calls from potentially appearing dead.
-let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, FP7,
- ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7,
- MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
- XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
- XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
- Uses = [ESP] in {
-def TLS_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
- "# TLS_addr32",
- [(X86tlsaddr tls32addr:$sym)]>,
- Requires<[Not64BitMode]>;
-def TLS_base_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
- "# TLS_base_addr32",
- [(X86tlsbaseaddr tls32baseaddr:$sym)]>,
- Requires<[Not64BitMode]>;
-}
-
-// All calls clobber the non-callee saved registers. RSP is marked as
-// a use to prevent stack-pointer assignments that appear immediately
-// before calls from potentially appearing dead.
-let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
- FP0, FP1, FP2, FP3, FP4, FP5, FP6, FP7,
- ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7,
- MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
- XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
- XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
- Uses = [RSP] in {
-def TLS_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
- "# TLS_addr64",
- [(X86tlsaddr tls64addr:$sym)]>,
- Requires<[In64BitMode]>;
-def TLS_base_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
- "# TLS_base_addr64",
- [(X86tlsbaseaddr tls64baseaddr:$sym)]>,
- Requires<[In64BitMode]>;
-}
-
-// Darwin TLS Support
-// For i386, the address of the thunk is passed on the stack; on return, the
-// address of the variable is in %eax. %ecx is trashed during the function
-// call. All other registers are preserved.
-let Defs = [EAX, ECX, EFLAGS],
- Uses = [ESP],
- usesCustomInserter = 1 in
-def TLSCall_32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
- "# TLSCall_32",
- [(X86TLSCall addr:$sym)]>,
- Requires<[Not64BitMode]>;
-
-// For x86_64, the address of the thunk is passed in %rdi, on return
-// the address of the variable is in %rax. All other registers are preserved.
-let Defs = [RAX, EFLAGS],
- Uses = [RSP, RDI],
- usesCustomInserter = 1 in
-def TLSCall_64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
- "# TLSCall_64",
- [(X86TLSCall addr:$sym)]>,
- Requires<[In64BitMode]>;
-
-
-//===----------------------------------------------------------------------===//
-// Conditional Move Pseudo Instructions
-
-// X86 doesn't have 8-bit conditional moves. Use a customInserter to
-// emit control flow. An alternative would be to mark i8 SELECT as Promote;
-// however, that requires promoting the operands and can induce additional
-// i8 register pressure.
-let usesCustomInserter = 1, Uses = [EFLAGS] in {
-def CMOV_GR8 : I<0, Pseudo,
- (outs GR8:$dst), (ins GR8:$src1, GR8:$src2, i8imm:$cond),
- "#CMOV_GR8 PSEUDO!",
- [(set GR8:$dst, (X86cmov GR8:$src1, GR8:$src2,
- imm:$cond, EFLAGS))]>;
-
-let Predicates = [NoCMov] in {
-def CMOV_GR32 : I<0, Pseudo,
- (outs GR32:$dst), (ins GR32:$src1, GR32:$src2, i8imm:$cond),
- "#CMOV_GR32* PSEUDO!",
- [(set GR32:$dst,
- (X86cmov GR32:$src1, GR32:$src2, imm:$cond, EFLAGS))]>;
-def CMOV_GR16 : I<0, Pseudo,
- (outs GR16:$dst), (ins GR16:$src1, GR16:$src2, i8imm:$cond),
- "#CMOV_GR16* PSEUDO!",
- [(set GR16:$dst,
- (X86cmov GR16:$src1, GR16:$src2, imm:$cond, EFLAGS))]>;
-} // Predicates = [NoCMov]
-
-// fcmov doesn't handle all possible EFLAGS, so provide a fallback when SSE1
-// is not available.
-let Predicates = [FPStackf32] in
-def CMOV_RFP32 : I<0, Pseudo,
- (outs RFP32:$dst),
- (ins RFP32:$src1, RFP32:$src2, i8imm:$cond),
- "#CMOV_RFP32 PSEUDO!",
- [(set RFP32:$dst,
- (X86cmov RFP32:$src1, RFP32:$src2, imm:$cond,
- EFLAGS))]>;
-// fcmov doesn't handle all possible EFLAGS, so provide a fallback when SSE2
-// is not available.
-let Predicates = [FPStackf64] in
-def CMOV_RFP64 : I<0, Pseudo,
- (outs RFP64:$dst),
- (ins RFP64:$src1, RFP64:$src2, i8imm:$cond),
- "#CMOV_RFP64 PSEUDO!",
- [(set RFP64:$dst,
- (X86cmov RFP64:$src1, RFP64:$src2, imm:$cond,
- EFLAGS))]>;
-def CMOV_RFP80 : I<0, Pseudo,
- (outs RFP80:$dst),
- (ins RFP80:$src1, RFP80:$src2, i8imm:$cond),
- "#CMOV_RFP80 PSEUDO!",
- [(set RFP80:$dst,
- (X86cmov RFP80:$src1, RFP80:$src2, imm:$cond,
- EFLAGS))]>;
-} // usesCustomInserter = 1, Uses = [EFLAGS]
-
-
-//===----------------------------------------------------------------------===//
-// Normal-Instructions-With-Lock-Prefix Pseudo Instructions
-//===----------------------------------------------------------------------===//
-
-// FIXME: Use normal instructions and add lock prefix dynamically.
-
-// Memory barriers
-
-// TODO: Get this to fold the constant into the instruction.
-let isCodeGenOnly = 1, Defs = [EFLAGS] in
-def OR32mrLocked : I<0x09, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$zero),
- "or{l}\t{$zero, $dst|$dst, $zero}",
- [], IIC_ALU_MEM>, Requires<[Not64BitMode]>, LOCK,
- Sched<[WriteALULd, WriteRMW]>;
-
-let hasSideEffects = 1 in
-def Int_MemBarrier : I<0, Pseudo, (outs), (ins),
- "#MEMBARRIER",
- [(X86MemBarrier)]>, Sched<[WriteLoad]>;
-
-// RegOpc corresponds to the mr version of the instruction
-// ImmOpc corresponds to the mi version of the instruction
-// ImmOpc8 corresponds to the mi8 version of the instruction
-// ImmMod corresponds to the instruction format of the mi and mi8 versions
-multiclass LOCK_ArithBinOp<bits<8> RegOpc, bits<8> ImmOpc, bits<8> ImmOpc8,
- Format ImmMod, string mnemonic> {
-let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1,
- SchedRW = [WriteALULd, WriteRMW] in {
-
-def NAME#8mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
- RegOpc{3}, RegOpc{2}, RegOpc{1}, 0 },
- MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src2),
- !strconcat(mnemonic, "{b}\t",
- "{$src2, $dst|$dst, $src2}"),
- [], IIC_ALU_NONMEM>, LOCK;
-def NAME#16mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
- RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
- MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
- !strconcat(mnemonic, "{w}\t",
- "{$src2, $dst|$dst, $src2}"),
- [], IIC_ALU_NONMEM>, OpSize16, LOCK;
-def NAME#32mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
- RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
- MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
- !strconcat(mnemonic, "{l}\t",
- "{$src2, $dst|$dst, $src2}"),
- [], IIC_ALU_NONMEM>, OpSize32, LOCK;
-def NAME#64mr : RI<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
- RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
- MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
- !strconcat(mnemonic, "{q}\t",
- "{$src2, $dst|$dst, $src2}"),
- [], IIC_ALU_NONMEM>, LOCK;
-
-def NAME#8mi : Ii8<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
- ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 0 },
- ImmMod, (outs), (ins i8mem :$dst, i8imm :$src2),
- !strconcat(mnemonic, "{b}\t",
- "{$src2, $dst|$dst, $src2}"),
- [], IIC_ALU_MEM>, LOCK;
-
-def NAME#16mi : Ii16<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
- ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
- ImmMod, (outs), (ins i16mem :$dst, i16imm :$src2),
- !strconcat(mnemonic, "{w}\t",
- "{$src2, $dst|$dst, $src2}"),
- [], IIC_ALU_MEM>, OpSize16, LOCK;
-
-def NAME#32mi : Ii32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
- ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
- ImmMod, (outs), (ins i32mem :$dst, i32imm :$src2),
- !strconcat(mnemonic, "{l}\t",
- "{$src2, $dst|$dst, $src2}"),
- [], IIC_ALU_MEM>, OpSize32, LOCK;
-
-def NAME#64mi32 : RIi32S<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
- ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
- ImmMod, (outs), (ins i64mem :$dst, i64i32imm :$src2),
- !strconcat(mnemonic, "{q}\t",
- "{$src2, $dst|$dst, $src2}"),
- [], IIC_ALU_MEM>, LOCK;
-
-def NAME#16mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
- ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
- ImmMod, (outs), (ins i16mem :$dst, i16i8imm :$src2),
- !strconcat(mnemonic, "{w}\t",
- "{$src2, $dst|$dst, $src2}"),
- [], IIC_ALU_MEM>, OpSize16, LOCK;
-def NAME#32mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
- ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
- ImmMod, (outs), (ins i32mem :$dst, i32i8imm :$src2),
- !strconcat(mnemonic, "{l}\t",
- "{$src2, $dst|$dst, $src2}"),
- [], IIC_ALU_MEM>, OpSize32, LOCK;
-def NAME#64mi8 : RIi8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
- ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
- ImmMod, (outs), (ins i64mem :$dst, i64i8imm :$src2),
- !strconcat(mnemonic, "{q}\t",
- "{$src2, $dst|$dst, $src2}"),
- [], IIC_ALU_MEM>, LOCK;
-
-}
-
-}
-
-defm LOCK_ADD : LOCK_ArithBinOp<0x00, 0x80, 0x83, MRM0m, "add">;
-defm LOCK_SUB : LOCK_ArithBinOp<0x28, 0x80, 0x83, MRM5m, "sub">;
-defm LOCK_OR : LOCK_ArithBinOp<0x08, 0x80, 0x83, MRM1m, "or">;
-defm LOCK_AND : LOCK_ArithBinOp<0x20, 0x80, 0x83, MRM4m, "and">;
-defm LOCK_XOR : LOCK_ArithBinOp<0x30, 0x80, 0x83, MRM6m, "xor">;
-
-// Optimized codegen when the non-memory output is not used.
-multiclass LOCK_ArithUnOp<bits<8> Opc8, bits<8> Opc, Format Form,
- string mnemonic> {
-let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1,
- SchedRW = [WriteALULd, WriteRMW] in {
-
-def NAME#8m : I<Opc8, Form, (outs), (ins i8mem :$dst),
- !strconcat(mnemonic, "{b}\t$dst"),
- [], IIC_UNARY_MEM>, LOCK;
-def NAME#16m : I<Opc, Form, (outs), (ins i16mem:$dst),
- !strconcat(mnemonic, "{w}\t$dst"),
- [], IIC_UNARY_MEM>, OpSize16, LOCK;
-def NAME#32m : I<Opc, Form, (outs), (ins i32mem:$dst),
- !strconcat(mnemonic, "{l}\t$dst"),
- [], IIC_UNARY_MEM>, OpSize32, LOCK;
-def NAME#64m : RI<Opc, Form, (outs), (ins i64mem:$dst),
- !strconcat(mnemonic, "{q}\t$dst"),
- [], IIC_UNARY_MEM>, LOCK;
-}
-}
-
-defm LOCK_INC : LOCK_ArithUnOp<0xFE, 0xFF, MRM0m, "inc">;
-defm LOCK_DEC : LOCK_ArithUnOp<0xFE, 0xFF, MRM1m, "dec">;
-
-// Atomic compare and swap.
-multiclass LCMPXCHG_UnOp<bits<8> Opc, Format Form, string mnemonic,
- SDPatternOperator frag, X86MemOperand x86memop,
- InstrItinClass itin> {
-let isCodeGenOnly = 1 in {
- def NAME : I<Opc, Form, (outs), (ins x86memop:$ptr),
- !strconcat(mnemonic, "\t$ptr"),
- [(frag addr:$ptr)], itin>, TB, LOCK;
-}
-}
-
-multiclass LCMPXCHG_BinOp<bits<8> Opc8, bits<8> Opc, Format Form,
- string mnemonic, SDPatternOperator frag,
- InstrItinClass itin8, InstrItinClass itin> {
-let isCodeGenOnly = 1, SchedRW = [WriteALULd, WriteRMW] in {
- let Defs = [AL, EFLAGS], Uses = [AL] in
- def NAME#8 : I<Opc8, Form, (outs), (ins i8mem:$ptr, GR8:$swap),
- !strconcat(mnemonic, "{b}\t{$swap, $ptr|$ptr, $swap}"),
- [(frag addr:$ptr, GR8:$swap, 1)], itin8>, TB, LOCK;
- let Defs = [AX, EFLAGS], Uses = [AX] in
- def NAME#16 : I<Opc, Form, (outs), (ins i16mem:$ptr, GR16:$swap),
- !strconcat(mnemonic, "{w}\t{$swap, $ptr|$ptr, $swap}"),
- [(frag addr:$ptr, GR16:$swap, 2)], itin>, TB, OpSize16, LOCK;
- let Defs = [EAX, EFLAGS], Uses = [EAX] in
- def NAME#32 : I<Opc, Form, (outs), (ins i32mem:$ptr, GR32:$swap),
- !strconcat(mnemonic, "{l}\t{$swap, $ptr|$ptr, $swap}"),
- [(frag addr:$ptr, GR32:$swap, 4)], itin>, TB, OpSize32, LOCK;
- let Defs = [RAX, EFLAGS], Uses = [RAX] in
- def NAME#64 : RI<Opc, Form, (outs), (ins i64mem:$ptr, GR64:$swap),
- !strconcat(mnemonic, "{q}\t{$swap, $ptr|$ptr, $swap}"),
- [(frag addr:$ptr, GR64:$swap, 8)], itin>, TB, LOCK;
-}
-}
-
-let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX],
- SchedRW = [WriteALULd, WriteRMW] in {
-defm LCMPXCHG8B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg8b",
- X86cas8, i64mem,
- IIC_CMPX_LOCK_8B>;
-}
-
-let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX],
- Predicates = [HasCmpxchg16b], SchedRW = [WriteALULd, WriteRMW] in {
-defm LCMPXCHG16B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg16b",
- X86cas16, i128mem,
- IIC_CMPX_LOCK_16B>, REX_W;
-}
-
-defm LCMPXCHG : LCMPXCHG_BinOp<0xB0, 0xB1, MRMDestMem, "cmpxchg",
- X86cas, IIC_CMPX_LOCK_8, IIC_CMPX_LOCK>;
-
-// Atomic exchange and add
-multiclass ATOMIC_LOAD_BINOP<bits<8> opc8, bits<8> opc, string mnemonic,
- string frag,
- InstrItinClass itin8, InstrItinClass itin> {
- let Constraints = "$val = $dst", Defs = [EFLAGS], isCodeGenOnly = 1,
- SchedRW = [WriteALULd, WriteRMW] in {
- def NAME#8 : I<opc8, MRMSrcMem, (outs GR8:$dst),
- (ins GR8:$val, i8mem:$ptr),
- !strconcat(mnemonic, "{b}\t{$val, $ptr|$ptr, $val}"),
- [(set GR8:$dst,
- (!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val))],
- itin8>;
- def NAME#16 : I<opc, MRMSrcMem, (outs GR16:$dst),
- (ins GR16:$val, i16mem:$ptr),
- !strconcat(mnemonic, "{w}\t{$val, $ptr|$ptr, $val}"),
- [(set
- GR16:$dst,
- (!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val))],
- itin>, OpSize16;
- def NAME#32 : I<opc, MRMSrcMem, (outs GR32:$dst),
- (ins GR32:$val, i32mem:$ptr),
- !strconcat(mnemonic, "{l}\t{$val, $ptr|$ptr, $val}"),
- [(set
- GR32:$dst,
- (!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val))],
- itin>, OpSize32;
- def NAME#64 : RI<opc, MRMSrcMem, (outs GR64:$dst),
- (ins GR64:$val, i64mem:$ptr),
- !strconcat(mnemonic, "{q}\t{$val, $ptr|$ptr, $val}"),
- [(set
- GR64:$dst,
- (!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val))],
- itin>;
- }
-}
-
-defm LXADD : ATOMIC_LOAD_BINOP<0xc0, 0xc1, "xadd", "atomic_load_add",
- IIC_XADD_LOCK_MEM8, IIC_XADD_LOCK_MEM>,
- TB, LOCK;
-
-/* The following multiclass tries to make sure that in code like
- *    x.store (immediate op x.load(acquire), release)
- * an operation directly on memory is generated instead of wasting a register.
- * It is not automatic: atomic_store/load are only lowered to MOV instructions
- * extremely late, to prevent them from being accidentally reordered in the
- * backend (see the RELEASE_MOV* / ACQUIRE_MOV* pseudo-instructions below).
- */
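-// A minimal C11 sketch of the pattern being targeted (assuming x has static
-// storage so it is directly addressable):
-//   _Atomic int x;
-//   atomic_store_explicit(&x,
-//       atomic_load_explicit(&x, memory_order_acquire) + 5,
-//       memory_order_release);
-// which these pseudos let us emit as a single "addl $5, x(%rip)" instead of a
-// separate load, add, and store.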
-multiclass RELEASE_BINOP_MI<string op> {
- def NAME#8mi : I<0, Pseudo, (outs), (ins i8mem:$dst, i8imm:$src),
- "#RELEASE_BINOP PSEUDO!",
- [(atomic_store_8 addr:$dst, (!cast<PatFrag>(op)
- (atomic_load_8 addr:$dst), (i8 imm:$src)))]>;
- // NAME#16 is not generated as 16-bit arithmetic instructions are considered
- // costly and avoided as far as possible by this backend anyway
- def NAME#32mi : I<0, Pseudo, (outs), (ins i32mem:$dst, i32imm:$src),
- "#RELEASE_BINOP PSEUDO!",
- [(atomic_store_32 addr:$dst, (!cast<PatFrag>(op)
- (atomic_load_32 addr:$dst), (i32 imm:$src)))]>;
- def NAME#64mi32 : I<0, Pseudo, (outs), (ins i64mem:$dst, i64i32imm:$src),
- "#RELEASE_BINOP PSEUDO!",
- [(atomic_store_64 addr:$dst, (!cast<PatFrag>(op)
- (atomic_load_64 addr:$dst), (i64immSExt32:$src)))]>;
-}
-defm RELEASE_ADD : RELEASE_BINOP_MI<"add">;
-defm RELEASE_AND : RELEASE_BINOP_MI<"and">;
-defm RELEASE_OR : RELEASE_BINOP_MI<"or">;
-defm RELEASE_XOR : RELEASE_BINOP_MI<"xor">;
-// Note: we don't deal with sub, because subtractions of constants are
-// optimized into additions before this code can run.
-
-multiclass RELEASE_UNOP<dag dag8, dag dag16, dag dag32, dag dag64> {
- def NAME#8m : I<0, Pseudo, (outs), (ins i8mem:$dst),
- "#RELEASE_UNOP PSEUDO!",
- [(atomic_store_8 addr:$dst, dag8)]>;
- def NAME#16m : I<0, Pseudo, (outs), (ins i16mem:$dst),
- "#RELEASE_UNOP PSEUDO!",
- [(atomic_store_16 addr:$dst, dag16)]>;
- def NAME#32m : I<0, Pseudo, (outs), (ins i32mem:$dst),
- "#RELEASE_UNOP PSEUDO!",
- [(atomic_store_32 addr:$dst, dag32)]>;
- def NAME#64m : I<0, Pseudo, (outs), (ins i64mem:$dst),
- "#RELEASE_UNOP PSEUDO!",
- [(atomic_store_64 addr:$dst, dag64)]>;
-}
-
-defm RELEASE_INC : RELEASE_UNOP<
- (add (atomic_load_8 addr:$dst), (i8 1)),
- (add (atomic_load_16 addr:$dst), (i16 1)),
- (add (atomic_load_32 addr:$dst), (i32 1)),
- (add (atomic_load_64 addr:$dst), (i64 1))>, Requires<[NotSlowIncDec]>;
-defm RELEASE_DEC : RELEASE_UNOP<
- (add (atomic_load_8 addr:$dst), (i8 -1)),
- (add (atomic_load_16 addr:$dst), (i16 -1)),
- (add (atomic_load_32 addr:$dst), (i32 -1)),
- (add (atomic_load_64 addr:$dst), (i64 -1))>, Requires<[NotSlowIncDec]>;
-/*
-TODO: These don't work because the type inference of TableGen fails.
-TODO: find a way to fix it.
-defm RELEASE_NEG : RELEASE_UNOP<
- (ineg (atomic_load_8 addr:$dst)),
- (ineg (atomic_load_16 addr:$dst)),
- (ineg (atomic_load_32 addr:$dst)),
- (ineg (atomic_load_64 addr:$dst))>;
-defm RELEASE_NOT : RELEASE_UNOP<
- (not (atomic_load_8 addr:$dst)),
- (not (atomic_load_16 addr:$dst)),
- (not (atomic_load_32 addr:$dst)),
- (not (atomic_load_64 addr:$dst))>;
-*/
-
-def RELEASE_MOV8mi : I<0, Pseudo, (outs), (ins i8mem:$dst, i8imm:$src),
- "#RELEASE_MOV PSEUDO !",
- [(atomic_store_8 addr:$dst, (i8 imm:$src))]>;
-def RELEASE_MOV16mi : I<0, Pseudo, (outs), (ins i16mem:$dst, i16imm:$src),
- "#RELEASE_MOV PSEUDO !",
- [(atomic_store_16 addr:$dst, (i16 imm:$src))]>;
-def RELEASE_MOV32mi : I<0, Pseudo, (outs), (ins i32mem:$dst, i32imm:$src),
- "#RELEASE_MOV PSEUDO !",
- [(atomic_store_32 addr:$dst, (i32 imm:$src))]>;
-def RELEASE_MOV64mi32 : I<0, Pseudo, (outs), (ins i64mem:$dst, i64i32imm:$src),
- "#RELEASE_MOV PSEUDO !",
- [(atomic_store_64 addr:$dst, i64immSExt32:$src)]>;
-
-def RELEASE_MOV8mr : I<0, Pseudo, (outs), (ins i8mem :$dst, GR8 :$src),
- "#RELEASE_MOV PSEUDO!",
- [(atomic_store_8 addr:$dst, GR8 :$src)]>;
-def RELEASE_MOV16mr : I<0, Pseudo, (outs), (ins i16mem:$dst, GR16:$src),
- "#RELEASE_MOV PSEUDO!",
- [(atomic_store_16 addr:$dst, GR16:$src)]>;
-def RELEASE_MOV32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, GR32:$src),
- "#RELEASE_MOV PSEUDO!",
- [(atomic_store_32 addr:$dst, GR32:$src)]>;
-def RELEASE_MOV64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, GR64:$src),
- "#RELEASE_MOV PSEUDO!",
- [(atomic_store_64 addr:$dst, GR64:$src)]>;
-
-def ACQUIRE_MOV8rm : I<0, Pseudo, (outs GR8 :$dst), (ins i8mem :$src),
- "#ACQUIRE_MOV PSEUDO!",
- [(set GR8:$dst, (atomic_load_8 addr:$src))]>;
-def ACQUIRE_MOV16rm : I<0, Pseudo, (outs GR16:$dst), (ins i16mem:$src),
- "#ACQUIRE_MOV PSEUDO!",
- [(set GR16:$dst, (atomic_load_16 addr:$src))]>;
-def ACQUIRE_MOV32rm : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$src),
- "#ACQUIRE_MOV PSEUDO!",
- [(set GR32:$dst, (atomic_load_32 addr:$src))]>;
-def ACQUIRE_MOV64rm : I<0, Pseudo, (outs GR64:$dst), (ins i64mem:$src),
- "#ACQUIRE_MOV PSEUDO!",
- [(set GR64:$dst, (atomic_load_64 addr:$src))]>;
-//===----------------------------------------------------------------------===//
-// Conditional Move Pseudo Instructions.
-//===----------------------------------------------------------------------===//
-
-// CMOV* - Used to implement the SSE SELECT DAG operation. Expanded after
-// instruction selection into a branch sequence.
-let Uses = [EFLAGS], usesCustomInserter = 1 in {
- def CMOV_FR32 : I<0, Pseudo,
- (outs FR32:$dst), (ins FR32:$t, FR32:$f, i8imm:$cond),
- "#CMOV_FR32 PSEUDO!",
- [(set FR32:$dst, (X86cmov FR32:$t, FR32:$f, imm:$cond,
- EFLAGS))]>;
- def CMOV_FR64 : I<0, Pseudo,
- (outs FR64:$dst), (ins FR64:$t, FR64:$f, i8imm:$cond),
- "#CMOV_FR64 PSEUDO!",
- [(set FR64:$dst, (X86cmov FR64:$t, FR64:$f, imm:$cond,
- EFLAGS))]>;
- def CMOV_V4F32 : I<0, Pseudo,
- (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond),
- "#CMOV_V4F32 PSEUDO!",
- [(set VR128:$dst,
- (v4f32 (X86cmov VR128:$t, VR128:$f, imm:$cond,
- EFLAGS)))]>;
- def CMOV_V2F64 : I<0, Pseudo,
- (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond),
- "#CMOV_V2F64 PSEUDO!",
- [(set VR128:$dst,
- (v2f64 (X86cmov VR128:$t, VR128:$f, imm:$cond,
- EFLAGS)))]>;
- def CMOV_V2I64 : I<0, Pseudo,
- (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond),
- "#CMOV_V2I64 PSEUDO!",
- [(set VR128:$dst,
- (v2i64 (X86cmov VR128:$t, VR128:$f, imm:$cond,
- EFLAGS)))]>;
- def CMOV_V8F32 : I<0, Pseudo,
- (outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond),
- "#CMOV_V8F32 PSEUDO!",
- [(set VR256:$dst,
- (v8f32 (X86cmov VR256:$t, VR256:$f, imm:$cond,
- EFLAGS)))]>;
- def CMOV_V4F64 : I<0, Pseudo,
- (outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond),
- "#CMOV_V4F64 PSEUDO!",
- [(set VR256:$dst,
- (v4f64 (X86cmov VR256:$t, VR256:$f, imm:$cond,
- EFLAGS)))]>;
- def CMOV_V4I64 : I<0, Pseudo,
- (outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond),
- "#CMOV_V4I64 PSEUDO!",
- [(set VR256:$dst,
- (v4i64 (X86cmov VR256:$t, VR256:$f, imm:$cond,
- EFLAGS)))]>;
- def CMOV_V8I64 : I<0, Pseudo,
- (outs VR512:$dst), (ins VR512:$t, VR512:$f, i8imm:$cond),
- "#CMOV_V8I64 PSEUDO!",
- [(set VR512:$dst,
- (v8i64 (X86cmov VR512:$t, VR512:$f, imm:$cond,
- EFLAGS)))]>;
- def CMOV_V8F64 : I<0, Pseudo,
- (outs VR512:$dst), (ins VR512:$t, VR512:$f, i8imm:$cond),
- "#CMOV_V8F64 PSEUDO!",
- [(set VR512:$dst,
- (v8f64 (X86cmov VR512:$t, VR512:$f, imm:$cond,
- EFLAGS)))]>;
- def CMOV_V16F32 : I<0, Pseudo,
- (outs VR512:$dst), (ins VR512:$t, VR512:$f, i8imm:$cond),
- "#CMOV_V16F32 PSEUDO!",
- [(set VR512:$dst,
- (v16f32 (X86cmov VR512:$t, VR512:$f, imm:$cond,
- EFLAGS)))]>;
-}
-
-
-//===----------------------------------------------------------------------===//
-// DAG Pattern Matching Rules
-//===----------------------------------------------------------------------===//
-
-// ConstantPool GlobalAddress, ExternalSymbol, and JumpTable
-def : Pat<(i32 (X86Wrapper tconstpool :$dst)), (MOV32ri tconstpool :$dst)>;
-def : Pat<(i32 (X86Wrapper tjumptable :$dst)), (MOV32ri tjumptable :$dst)>;
-def : Pat<(i32 (X86Wrapper tglobaltlsaddr:$dst)),(MOV32ri tglobaltlsaddr:$dst)>;
-def : Pat<(i32 (X86Wrapper tglobaladdr :$dst)), (MOV32ri tglobaladdr :$dst)>;
-def : Pat<(i32 (X86Wrapper texternalsym:$dst)), (MOV32ri texternalsym:$dst)>;
-def : Pat<(i32 (X86Wrapper tblockaddress:$dst)), (MOV32ri tblockaddress:$dst)>;
-
-def : Pat<(add GR32:$src1, (X86Wrapper tconstpool:$src2)),
- (ADD32ri GR32:$src1, tconstpool:$src2)>;
-def : Pat<(add GR32:$src1, (X86Wrapper tjumptable:$src2)),
- (ADD32ri GR32:$src1, tjumptable:$src2)>;
-def : Pat<(add GR32:$src1, (X86Wrapper tglobaladdr :$src2)),
- (ADD32ri GR32:$src1, tglobaladdr:$src2)>;
-def : Pat<(add GR32:$src1, (X86Wrapper texternalsym:$src2)),
- (ADD32ri GR32:$src1, texternalsym:$src2)>;
-def : Pat<(add GR32:$src1, (X86Wrapper tblockaddress:$src2)),
- (ADD32ri GR32:$src1, tblockaddress:$src2)>;
-
-def : Pat<(store (i32 (X86Wrapper tglobaladdr:$src)), addr:$dst),
- (MOV32mi addr:$dst, tglobaladdr:$src)>;
-def : Pat<(store (i32 (X86Wrapper texternalsym:$src)), addr:$dst),
- (MOV32mi addr:$dst, texternalsym:$src)>;
-def : Pat<(store (i32 (X86Wrapper tblockaddress:$src)), addr:$dst),
- (MOV32mi addr:$dst, tblockaddress:$src)>;
-
-// ConstantPool GlobalAddress, ExternalSymbol, and JumpTable when not in small
-// code model mode, should use 'movabs'. FIXME: This is really a hack, the
-// 'movabs' predicate should handle this sort of thing.
-def : Pat<(i64 (X86Wrapper tconstpool :$dst)),
- (MOV64ri tconstpool :$dst)>, Requires<[FarData]>;
-def : Pat<(i64 (X86Wrapper tjumptable :$dst)),
- (MOV64ri tjumptable :$dst)>, Requires<[FarData]>;
-def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)),
- (MOV64ri tglobaladdr :$dst)>, Requires<[FarData]>;
-def : Pat<(i64 (X86Wrapper texternalsym:$dst)),
- (MOV64ri texternalsym:$dst)>, Requires<[FarData]>;
-def : Pat<(i64 (X86Wrapper tblockaddress:$dst)),
- (MOV64ri tblockaddress:$dst)>, Requires<[FarData]>;
-
-// In kernel code model, we can get the address of a label
-// into a register with 'movq'. FIXME: This is a hack, the 'imm' predicate of
-// the MOV64ri32 should accept these.
-def : Pat<(i64 (X86Wrapper tconstpool :$dst)),
- (MOV64ri32 tconstpool :$dst)>, Requires<[KernelCode]>;
-def : Pat<(i64 (X86Wrapper tjumptable :$dst)),
- (MOV64ri32 tjumptable :$dst)>, Requires<[KernelCode]>;
-def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)),
- (MOV64ri32 tglobaladdr :$dst)>, Requires<[KernelCode]>;
-def : Pat<(i64 (X86Wrapper texternalsym:$dst)),
- (MOV64ri32 texternalsym:$dst)>, Requires<[KernelCode]>;
-def : Pat<(i64 (X86Wrapper tblockaddress:$dst)),
- (MOV64ri32 tblockaddress:$dst)>, Requires<[KernelCode]>;
-
-// If we have the small code model and -static mode, it is safe to store global
-// addresses directly as immediates. FIXME: This is really a hack, the 'imm'
-// predicate for MOV64mi32 should handle this sort of thing.
-def : Pat<(store (i64 (X86Wrapper tconstpool:$src)), addr:$dst),
- (MOV64mi32 addr:$dst, tconstpool:$src)>,
- Requires<[NearData, IsStatic]>;
-def : Pat<(store (i64 (X86Wrapper tjumptable:$src)), addr:$dst),
- (MOV64mi32 addr:$dst, tjumptable:$src)>,
- Requires<[NearData, IsStatic]>;
-def : Pat<(store (i64 (X86Wrapper tglobaladdr:$src)), addr:$dst),
- (MOV64mi32 addr:$dst, tglobaladdr:$src)>,
- Requires<[NearData, IsStatic]>;
-def : Pat<(store (i64 (X86Wrapper texternalsym:$src)), addr:$dst),
- (MOV64mi32 addr:$dst, texternalsym:$src)>,
- Requires<[NearData, IsStatic]>;
-def : Pat<(store (i64 (X86Wrapper tblockaddress:$src)), addr:$dst),
- (MOV64mi32 addr:$dst, tblockaddress:$src)>,
- Requires<[NearData, IsStatic]>;
-
-def : Pat<(i32 (X86RecoverFrameAlloc texternalsym:$dst)), (MOV32ri texternalsym:$dst)>;
-def : Pat<(i64 (X86RecoverFrameAlloc texternalsym:$dst)), (MOV64ri texternalsym:$dst)>;
-
-// Calls
-
-// tls has some funny stuff here...
-// This corresponds to movabs $foo@tpoff, %rax
-def : Pat<(i64 (X86Wrapper tglobaltlsaddr :$dst)),
- (MOV64ri32 tglobaltlsaddr :$dst)>;
-// This corresponds to add $foo@tpoff, %rax
-def : Pat<(add GR64:$src1, (X86Wrapper tglobaltlsaddr :$dst)),
- (ADD64ri32 GR64:$src1, tglobaltlsaddr :$dst)>;
-
-
-// Direct PC relative function call for small code model. 32-bit displacement
-// sign extended to 64-bit.
-def : Pat<(X86call (i64 tglobaladdr:$dst)),
- (CALL64pcrel32 tglobaladdr:$dst)>;
-def : Pat<(X86call (i64 texternalsym:$dst)),
- (CALL64pcrel32 texternalsym:$dst)>;
-
-// Tailcall stuff. The TCRETURN instructions execute after the epilog, so they
-// can never use callee-saved registers. That is the purpose of the GR64_TC
-// register classes.
-//
-// The only volatile register that is never used by the calling convention is
-// %r11; every other volatile register can be live when calling a vararg
-// function with 6 arguments.
-//
-// Match an X86tcret that uses less than 7 volatile registers.
-def X86tcret_6regs : PatFrag<(ops node:$ptr, node:$off),
- (X86tcret node:$ptr, node:$off), [{
- // X86tcret args: (*chain, ptr, imm, regs..., glue)
- unsigned NumRegs = 0;
- for (unsigned i = 3, e = N->getNumOperands(); i != e; ++i)
- if (isa<RegisterSDNode>(N->getOperand(i)) && ++NumRegs > 6)
- return false;
- return true;
-}]>;
-
-def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off),
- (TCRETURNri ptr_rc_tailcall:$dst, imm:$off)>,
- Requires<[Not64BitMode]>;
-
-// FIXME: This is disabled for 32-bit PIC mode because the global base
-// register which is part of the address mode may be assigned a
-// callee-saved register.
-def : Pat<(X86tcret (load addr:$dst), imm:$off),
- (TCRETURNmi addr:$dst, imm:$off)>,
- Requires<[Not64BitMode, IsNotPIC]>;
-
-def : Pat<(X86tcret (i32 tglobaladdr:$dst), imm:$off),
- (TCRETURNdi tglobaladdr:$dst, imm:$off)>,
- Requires<[NotLP64]>;
-
-def : Pat<(X86tcret (i32 texternalsym:$dst), imm:$off),
- (TCRETURNdi texternalsym:$dst, imm:$off)>,
- Requires<[NotLP64]>;
-
-def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off),
- (TCRETURNri64 ptr_rc_tailcall:$dst, imm:$off)>,
- Requires<[In64BitMode]>;
-
-// Don't fold loads into X86tcret requiring more than 6 regs.
-// There wouldn't be enough scratch registers for base+index.
-def : Pat<(X86tcret_6regs (load addr:$dst), imm:$off),
- (TCRETURNmi64 addr:$dst, imm:$off)>,
- Requires<[In64BitMode]>;
-
-def : Pat<(X86tcret (i64 tglobaladdr:$dst), imm:$off),
- (TCRETURNdi64 tglobaladdr:$dst, imm:$off)>,
- Requires<[IsLP64]>;
-
-def : Pat<(X86tcret (i64 texternalsym:$dst), imm:$off),
- (TCRETURNdi64 texternalsym:$dst, imm:$off)>,
- Requires<[IsLP64]>;
-
-// Normal calls, with various flavors of addresses.
-def : Pat<(X86call (i32 tglobaladdr:$dst)),
- (CALLpcrel32 tglobaladdr:$dst)>;
-def : Pat<(X86call (i32 texternalsym:$dst)),
- (CALLpcrel32 texternalsym:$dst)>;
-def : Pat<(X86call (i32 imm:$dst)),
- (CALLpcrel32 imm:$dst)>, Requires<[CallImmAddr]>;
-
-// Comparisons.
-
-// TEST R,R is smaller than CMP R,0
-def : Pat<(X86cmp GR8:$src1, 0),
- (TEST8rr GR8:$src1, GR8:$src1)>;
-def : Pat<(X86cmp GR16:$src1, 0),
- (TEST16rr GR16:$src1, GR16:$src1)>;
-def : Pat<(X86cmp GR32:$src1, 0),
- (TEST32rr GR32:$src1, GR32:$src1)>;
-def : Pat<(X86cmp GR64:$src1, 0),
- (TEST64rr GR64:$src1, GR64:$src1)>;
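-// For instance, "testl %eax, %eax" encodes in 2 bytes (85 C0) while
-// "cmpl $0, %eax" needs 3 (83 F8 00); both set ZF/SF the same way for a
-// compare against zero.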
-
-// Conditional moves with folded loads, with the operands swapped and the
-// conditions inverted.
-multiclass CMOVmr<PatLeaf InvertedCond, Instruction Inst16, Instruction Inst32,
- Instruction Inst64> {
- let Predicates = [HasCMov] in {
- def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, InvertedCond, EFLAGS),
- (Inst16 GR16:$src2, addr:$src1)>;
- def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, InvertedCond, EFLAGS),
- (Inst32 GR32:$src2, addr:$src1)>;
- def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, InvertedCond, EFLAGS),
- (Inst64 GR64:$src2, addr:$src1)>;
- }
-}
-
-defm : CMOVmr<X86_COND_B , CMOVAE16rm, CMOVAE32rm, CMOVAE64rm>;
-defm : CMOVmr<X86_COND_AE, CMOVB16rm , CMOVB32rm , CMOVB64rm>;
-defm : CMOVmr<X86_COND_E , CMOVNE16rm, CMOVNE32rm, CMOVNE64rm>;
-defm : CMOVmr<X86_COND_NE, CMOVE16rm , CMOVE32rm , CMOVE64rm>;
-defm : CMOVmr<X86_COND_BE, CMOVA16rm , CMOVA32rm , CMOVA64rm>;
-defm : CMOVmr<X86_COND_A , CMOVBE16rm, CMOVBE32rm, CMOVBE64rm>;
-defm : CMOVmr<X86_COND_L , CMOVGE16rm, CMOVGE32rm, CMOVGE64rm>;
-defm : CMOVmr<X86_COND_GE, CMOVL16rm , CMOVL32rm , CMOVL64rm>;
-defm : CMOVmr<X86_COND_LE, CMOVG16rm , CMOVG32rm , CMOVG64rm>;
-defm : CMOVmr<X86_COND_G , CMOVLE16rm, CMOVLE32rm, CMOVLE64rm>;
-defm : CMOVmr<X86_COND_P , CMOVNP16rm, CMOVNP32rm, CMOVNP64rm>;
-defm : CMOVmr<X86_COND_NP, CMOVP16rm , CMOVP32rm , CMOVP64rm>;
-defm : CMOVmr<X86_COND_S , CMOVNS16rm, CMOVNS32rm, CMOVNS64rm>;
-defm : CMOVmr<X86_COND_NS, CMOVS16rm , CMOVS32rm , CMOVS64rm>;
-defm : CMOVmr<X86_COND_O , CMOVNO16rm, CMOVNO32rm, CMOVNO64rm>;
-defm : CMOVmr<X86_COND_NO, CMOVO16rm , CMOVO32rm , CMOVO64rm>;
-
-// zextload bool -> zextload byte
-def : Pat<(zextloadi8i1 addr:$src), (MOV8rm addr:$src)>;
-def : Pat<(zextloadi16i1 addr:$src), (MOVZX16rm8 addr:$src)>;
-def : Pat<(zextloadi32i1 addr:$src), (MOVZX32rm8 addr:$src)>;
-def : Pat<(zextloadi64i1 addr:$src),
- (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>;
-
-// extload bool -> extload byte
-// When extloading from 16-bit and smaller memory locations into 64-bit
-// registers, use zero-extending loads so that the entire 64-bit register is
-// defined, avoiding partial-register updates.
-
-def : Pat<(extloadi8i1 addr:$src), (MOV8rm addr:$src)>;
-def : Pat<(extloadi16i1 addr:$src), (MOVZX16rm8 addr:$src)>;
-def : Pat<(extloadi32i1 addr:$src), (MOVZX32rm8 addr:$src)>;
-def : Pat<(extloadi16i8 addr:$src), (MOVZX16rm8 addr:$src)>;
-def : Pat<(extloadi32i8 addr:$src), (MOVZX32rm8 addr:$src)>;
-def : Pat<(extloadi32i16 addr:$src), (MOVZX32rm16 addr:$src)>;
-
-// For other extloads, use subregs, since the high contents of the register are
-// defined after an extload.
-def : Pat<(extloadi64i1 addr:$src),
- (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>;
-def : Pat<(extloadi64i8 addr:$src),
- (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>;
-def : Pat<(extloadi64i16 addr:$src),
- (SUBREG_TO_REG (i64 0), (MOVZX32rm16 addr:$src), sub_32bit)>;
-def : Pat<(extloadi64i32 addr:$src),
- (SUBREG_TO_REG (i64 0), (MOV32rm addr:$src), sub_32bit)>;
-
-// anyext. Define these to do an explicit zero-extend to
-// avoid partial-register updates.
-def : Pat<(i16 (anyext GR8 :$src)), (EXTRACT_SUBREG
- (MOVZX32rr8 GR8 :$src), sub_16bit)>;
-def : Pat<(i32 (anyext GR8 :$src)), (MOVZX32rr8 GR8 :$src)>;
-
-// Except for i16 -> i32, since isel expects i16 ops to be promoted to i32.
-def : Pat<(i32 (anyext GR16:$src)),
- (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR16:$src, sub_16bit)>;
-
-def : Pat<(i64 (anyext GR8 :$src)),
- (SUBREG_TO_REG (i64 0), (MOVZX32rr8 GR8 :$src), sub_32bit)>;
-def : Pat<(i64 (anyext GR16:$src)),
- (SUBREG_TO_REG (i64 0), (MOVZX32rr16 GR16 :$src), sub_32bit)>;
-def : Pat<(i64 (anyext GR32:$src)),
- (SUBREG_TO_REG (i64 0), GR32:$src, sub_32bit)>;
-
-
-// Most instructions that define a 32-bit result implicitly zero-extend into
-// the high half of the 64-bit register. The exceptions are listed below:
-// Truncate can be lowered to EXTRACT_SUBREG, CopyFromReg may be copying from
-// a truncate, and x86's cmov doesn't do anything if the condition is false.
-// Any other 32-bit operation will zero-extend up to 64 bits.
-def def32 : PatLeaf<(i32 GR32:$src), [{
- return N->getOpcode() != ISD::TRUNCATE &&
- N->getOpcode() != TargetOpcode::EXTRACT_SUBREG &&
- N->getOpcode() != ISD::CopyFromReg &&
- N->getOpcode() != ISD::AssertSext &&
- N->getOpcode() != X86ISD::CMOV;
-}]>;
-
-// In the case of a 32-bit def that is known to implicitly zero-extend,
-// we can use a SUBREG_TO_REG.
-def : Pat<(i64 (zext def32:$src)),
- (SUBREG_TO_REG (i64 0), GR32:$src, sub_32bit)>;
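-// For example, in
-//   unsigned long long f(unsigned a, unsigned b) { return a + b; }
-// the 32-bit add already zeroes bits 63:32 of its result register, so the
-// zext to i64 is free; a plausible selection is just
-//   leal (%rdi,%rsi), %eax ; retq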
-
-//===----------------------------------------------------------------------===//
-// Pattern match OR as ADD
-//===----------------------------------------------------------------------===//
-
-// If safe, we prefer to pattern match OR as ADD at isel time. ADD can be
-// 3-addressified into an LEA instruction to avoid copies. However, we also
-// want to finally emit these instructions as an or at the end of the code
-// generator to make the generated code easier to read. To do this, we select
-// into "disjoint bits" pseudo ops.
-
-// Treat an 'or' node as an 'add' node if the or'ed bits are known to be zero.
-def or_is_add : PatFrag<(ops node:$lhs, node:$rhs), (or node:$lhs, node:$rhs),[{
- if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(1)))
- return CurDAG->MaskedValueIsZero(N->getOperand(0), CN->getAPIntValue());
-
- APInt KnownZero0, KnownOne0;
- CurDAG->computeKnownBits(N->getOperand(0), KnownZero0, KnownOne0, 0);
- APInt KnownZero1, KnownOne1;
- CurDAG->computeKnownBits(N->getOperand(1), KnownZero1, KnownOne1, 0);
- return (~KnownZero0 & ~KnownZero1) == 0;
-}]>;
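-// A small example of the payoff: in
-//   int f(int x) { return (x << 1) | 1; }
-// bit 0 of (x << 1) is known to be zero, so the or can be treated as an add
-// and the whole expression folded into a single
-//   leal 1(%rdi,%rdi), %eax
-// rather than a shift followed by a separate or.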
-
-
-// (or x1, x2) -> (add x1, x2) if two operands are known not to share bits.
-// Try this before the selecting to OR.
-let AddedComplexity = 5, SchedRW = [WriteALU] in {
-
-let isConvertibleToThreeAddress = 1,
- Constraints = "$src1 = $dst", Defs = [EFLAGS] in {
-let isCommutable = 1 in {
-def ADD16rr_DB : I<0, Pseudo, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
- "", // orw/addw REG, REG
- [(set GR16:$dst, (or_is_add GR16:$src1, GR16:$src2))]>;
-def ADD32rr_DB : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
- "", // orl/addl REG, REG
- [(set GR32:$dst, (or_is_add GR32:$src1, GR32:$src2))]>;
-def ADD64rr_DB : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
- "", // orq/addq REG, REG
- [(set GR64:$dst, (or_is_add GR64:$src1, GR64:$src2))]>;
-} // isCommutable
-
-// NOTE: These are order-specific; we want the ri8 forms to be listed
-// first so that they are slightly preferred to the ri forms.
-
-def ADD16ri8_DB : I<0, Pseudo,
- (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
- "", // orw/addw REG, imm8
- [(set GR16:$dst,(or_is_add GR16:$src1,i16immSExt8:$src2))]>;
-def ADD16ri_DB : I<0, Pseudo, (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
- "", // orw/addw REG, imm
- [(set GR16:$dst, (or_is_add GR16:$src1, imm:$src2))]>;
-
-def ADD32ri8_DB : I<0, Pseudo,
- (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2),
- "", // orl/addl REG, imm8
- [(set GR32:$dst,(or_is_add GR32:$src1,i32immSExt8:$src2))]>;
-def ADD32ri_DB : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2),
- "", // orl/addl REG, imm
- [(set GR32:$dst, (or_is_add GR32:$src1, imm:$src2))]>;
-
-
-def ADD64ri8_DB : I<0, Pseudo,
- (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
- "", // orq/addq REG, imm8
- [(set GR64:$dst, (or_is_add GR64:$src1,
- i64immSExt8:$src2))]>;
-def ADD64ri32_DB : I<0, Pseudo,
- (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2),
- "", // orq/addq REG, imm
- [(set GR64:$dst, (or_is_add GR64:$src1,
- i64immSExt32:$src2))]>;
-}
-} // AddedComplexity, SchedRW
-
-
-//===----------------------------------------------------------------------===//
-// Some peepholes
-//===----------------------------------------------------------------------===//
-
-// Odd encoding trick: -128 fits into an 8-bit immediate field while
-// +128 doesn't, so in this special case use a sub instead of an add.
-def : Pat<(add GR16:$src1, 128),
- (SUB16ri8 GR16:$src1, -128)>;
-def : Pat<(store (add (loadi16 addr:$dst), 128), addr:$dst),
- (SUB16mi8 addr:$dst, -128)>;
-
-def : Pat<(add GR32:$src1, 128),
- (SUB32ri8 GR32:$src1, -128)>;
-def : Pat<(store (add (loadi32 addr:$dst), 128), addr:$dst),
- (SUB32mi8 addr:$dst, -128)>;
-
-def : Pat<(add GR64:$src1, 128),
- (SUB64ri8 GR64:$src1, -128)>;
-def : Pat<(store (add (loadi64 addr:$dst), 128), addr:$dst),
- (SUB64mi8 addr:$dst, -128)>;
-
-// The same trick applies for 32-bit immediate fields in 64-bit
-// instructions.
-def : Pat<(add GR64:$src1, 0x0000000080000000),
- (SUB64ri32 GR64:$src1, 0xffffffff80000000)>;
-def : Pat<(store (add (loadi64 addr:$dst), 0x0000000080000000), addr:$dst),
- (SUB64mi32 addr:$dst, 0xffffffff80000000)>;
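-// Concretely, "addl $128, %eax" needs the 32-bit immediate form
-// (05 80 00 00 00, 5 bytes), while the equivalent "subl $-128, %eax" fits the
-// sign-extended 8-bit form (83 E8 80, 3 bytes).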
-
-// To avoid needing to materialize an immediate in a register, use a 32-bit and
-// with implicit zero-extension instead of a 64-bit and if the immediate has at
-// least 32 bits of leading zeros. If in addition the last 32 bits can be
-// represented with a sign extension of an 8-bit constant, use that.
-
-def : Pat<(and GR64:$src, i64immZExt32SExt8:$imm),
- (SUBREG_TO_REG
- (i64 0),
- (AND32ri8
- (EXTRACT_SUBREG GR64:$src, sub_32bit),
- (i32 (GetLo8XForm imm:$imm))),
- sub_32bit)>;
-
-def : Pat<(and GR64:$src, i64immZExt32:$imm),
- (SUBREG_TO_REG
- (i64 0),
- (AND32ri
- (EXTRACT_SUBREG GR64:$src, sub_32bit),
- (i32 (GetLo32XForm imm:$imm))),
- sub_32bit)>;
-
-
-// r & (2^16-1) ==> movz
-def : Pat<(and GR32:$src1, 0xffff),
- (MOVZX32rr16 (EXTRACT_SUBREG GR32:$src1, sub_16bit))>;
-// r & (2^8-1) ==> movz
-def : Pat<(and GR32:$src1, 0xff),
- (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src1,
- GR32_ABCD)),
- sub_8bit))>,
- Requires<[Not64BitMode]>;
-// r & (2^8-1) ==> movz
-def : Pat<(and GR16:$src1, 0xff),
- (EXTRACT_SUBREG (MOVZX32rr8 (EXTRACT_SUBREG
- (i16 (COPY_TO_REGCLASS GR16:$src1, GR16_ABCD)), sub_8bit)),
- sub_16bit)>,
- Requires<[Not64BitMode]>;
-
-// r & (2^32-1) ==> movz
-def : Pat<(and GR64:$src, 0x00000000FFFFFFFF),
- (SUBREG_TO_REG (i64 0),
- (MOV32rr (EXTRACT_SUBREG GR64:$src, sub_32bit)),
- sub_32bit)>;
-// r & (2^16-1) ==> movz
-def : Pat<(and GR64:$src, 0xffff),
- (SUBREG_TO_REG (i64 0),
- (MOVZX32rr16 (i16 (EXTRACT_SUBREG GR64:$src, sub_16bit))),
- sub_32bit)>;
-// r & (2^8-1) ==> movz
-def : Pat<(and GR64:$src, 0xff),
- (SUBREG_TO_REG (i64 0),
- (MOVZX32rr8 (i8 (EXTRACT_SUBREG GR64:$src, sub_8bit))),
- sub_32bit)>;
-// r & (2^8-1) ==> movz
-def : Pat<(and GR32:$src1, 0xff),
- (MOVZX32rr8 (EXTRACT_SUBREG GR32:$src1, sub_8bit))>,
- Requires<[In64BitMode]>;
-// r & (2^8-1) ==> movz
-def : Pat<(and GR16:$src1, 0xff),
- (EXTRACT_SUBREG (MOVZX32rr8 (i8
- (EXTRACT_SUBREG GR16:$src1, sub_8bit))), sub_16bit)>,
- Requires<[In64BitMode]>;
-
-
-// sext_inreg patterns
-def : Pat<(sext_inreg GR32:$src, i16),
- (MOVSX32rr16 (EXTRACT_SUBREG GR32:$src, sub_16bit))>;
-def : Pat<(sext_inreg GR32:$src, i8),
- (MOVSX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src,
- GR32_ABCD)),
- sub_8bit))>,
- Requires<[Not64BitMode]>;
-
-def : Pat<(sext_inreg GR16:$src, i8),
- (EXTRACT_SUBREG (i32 (MOVSX32rr8 (EXTRACT_SUBREG
- (i32 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), sub_8bit))),
- sub_16bit)>,
- Requires<[Not64BitMode]>;
-
-def : Pat<(sext_inreg GR64:$src, i32),
- (MOVSX64rr32 (EXTRACT_SUBREG GR64:$src, sub_32bit))>;
-def : Pat<(sext_inreg GR64:$src, i16),
- (MOVSX64rr16 (EXTRACT_SUBREG GR64:$src, sub_16bit))>;
-def : Pat<(sext_inreg GR64:$src, i8),
- (MOVSX64rr8 (EXTRACT_SUBREG GR64:$src, sub_8bit))>;
-def : Pat<(sext_inreg GR32:$src, i8),
- (MOVSX32rr8 (EXTRACT_SUBREG GR32:$src, sub_8bit))>,
- Requires<[In64BitMode]>;
-def : Pat<(sext_inreg GR16:$src, i8),
- (EXTRACT_SUBREG (MOVSX32rr8
- (EXTRACT_SUBREG GR16:$src, sub_8bit)), sub_16bit)>,
- Requires<[In64BitMode]>;
-
-// sext, sext_load, zext, zext_load
-def: Pat<(i16 (sext GR8:$src)),
- (EXTRACT_SUBREG (MOVSX32rr8 GR8:$src), sub_16bit)>;
-def: Pat<(sextloadi16i8 addr:$src),
- (EXTRACT_SUBREG (MOVSX32rm8 addr:$src), sub_16bit)>;
-def: Pat<(i16 (zext GR8:$src)),
- (EXTRACT_SUBREG (MOVZX32rr8 GR8:$src), sub_16bit)>;
-def: Pat<(zextloadi16i8 addr:$src),
- (EXTRACT_SUBREG (MOVZX32rm8 addr:$src), sub_16bit)>;
-
-// trunc patterns
-def : Pat<(i16 (trunc GR32:$src)),
- (EXTRACT_SUBREG GR32:$src, sub_16bit)>;
-def : Pat<(i8 (trunc GR32:$src)),
- (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)),
- sub_8bit)>,
- Requires<[Not64BitMode]>;
-def : Pat<(i8 (trunc GR16:$src)),
- (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
- sub_8bit)>,
- Requires<[Not64BitMode]>;
-def : Pat<(i32 (trunc GR64:$src)),
- (EXTRACT_SUBREG GR64:$src, sub_32bit)>;
-def : Pat<(i16 (trunc GR64:$src)),
- (EXTRACT_SUBREG GR64:$src, sub_16bit)>;
-def : Pat<(i8 (trunc GR64:$src)),
- (EXTRACT_SUBREG GR64:$src, sub_8bit)>;
-def : Pat<(i8 (trunc GR32:$src)),
- (EXTRACT_SUBREG GR32:$src, sub_8bit)>,
- Requires<[In64BitMode]>;
-def : Pat<(i8 (trunc GR16:$src)),
- (EXTRACT_SUBREG GR16:$src, sub_8bit)>,
- Requires<[In64BitMode]>;
-
-// h-register tricks
-def : Pat<(i8 (trunc (srl_su GR16:$src, (i8 8)))),
- (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
- sub_8bit_hi)>,
- Requires<[Not64BitMode]>;
-def : Pat<(i8 (trunc (srl_su GR32:$src, (i8 8)))),
- (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)),
- sub_8bit_hi)>,
- Requires<[Not64BitMode]>;
-def : Pat<(srl GR16:$src, (i8 8)),
- (EXTRACT_SUBREG
- (MOVZX32rr8
- (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
- sub_8bit_hi)),
- sub_16bit)>,
- Requires<[Not64BitMode]>;
-def : Pat<(i32 (zext (srl_su GR16:$src, (i8 8)))),
- (MOVZX32rr8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src,
- GR16_ABCD)),
- sub_8bit_hi))>,
- Requires<[Not64BitMode]>;
-def : Pat<(i32 (anyext (srl_su GR16:$src, (i8 8)))),
- (MOVZX32rr8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src,
- GR16_ABCD)),
- sub_8bit_hi))>,
- Requires<[Not64BitMode]>;
-def : Pat<(and (srl_su GR32:$src, (i8 8)), (i32 255)),
- (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src,
- GR32_ABCD)),
- sub_8bit_hi))>,
- Requires<[Not64BitMode]>;
-def : Pat<(srl (and_su GR32:$src, 0xff00), (i8 8)),
- (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src,
- GR32_ABCD)),
- sub_8bit_hi))>,
- Requires<[Not64BitMode]>;
-
-// h-register tricks.
-// For now, be conservative on x86-64 and use an h-register extract only if the
-// value is immediately zero-extended or stored, which are somewhat common
-// cases. This uses a bunch of code to prevent a register requiring a REX prefix
-// from being allocated in the same instruction as the h register, as there's
-// currently no way to describe this requirement to the register allocator.
-
-// h-register extract and zero-extend.
-def : Pat<(and (srl_su GR64:$src, (i8 8)), (i64 255)),
- (SUBREG_TO_REG
- (i64 0),
- (MOVZX32_NOREXrr8
- (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS GR64:$src, GR64_ABCD)),
- sub_8bit_hi)),
- sub_32bit)>;
-def : Pat<(and (srl_su GR32:$src, (i8 8)), (i32 255)),
- (MOVZX32_NOREXrr8
- (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)),
- sub_8bit_hi))>,
- Requires<[In64BitMode]>;
-def : Pat<(srl (and_su GR32:$src, 0xff00), (i8 8)),
- (MOVZX32_NOREXrr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src,
- GR32_ABCD)),
- sub_8bit_hi))>,
- Requires<[In64BitMode]>;
-def : Pat<(srl GR16:$src, (i8 8)),
- (EXTRACT_SUBREG
- (MOVZX32_NOREXrr8
- (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
- sub_8bit_hi)),
- sub_16bit)>,
- Requires<[In64BitMode]>;
-def : Pat<(i32 (zext (srl_su GR16:$src, (i8 8)))),
- (MOVZX32_NOREXrr8
- (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
- sub_8bit_hi))>,
- Requires<[In64BitMode]>;
-def : Pat<(i32 (anyext (srl_su GR16:$src, (i8 8)))),
- (MOVZX32_NOREXrr8
- (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
- sub_8bit_hi))>,
- Requires<[In64BitMode]>;
-def : Pat<(i64 (zext (srl_su GR16:$src, (i8 8)))),
- (SUBREG_TO_REG
- (i64 0),
- (MOVZX32_NOREXrr8
- (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
- sub_8bit_hi)),
- sub_32bit)>;
-def : Pat<(i64 (anyext (srl_su GR16:$src, (i8 8)))),
- (SUBREG_TO_REG
- (i64 0),
- (MOVZX32_NOREXrr8
- (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
- sub_8bit_hi)),
- sub_32bit)>;
-
-// h-register extract and store.
-def : Pat<(store (i8 (trunc_su (srl_su GR64:$src, (i8 8)))), addr:$dst),
- (MOV8mr_NOREX
- addr:$dst,
- (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS GR64:$src, GR64_ABCD)),
- sub_8bit_hi))>;
-def : Pat<(store (i8 (trunc_su (srl_su GR32:$src, (i8 8)))), addr:$dst),
- (MOV8mr_NOREX
- addr:$dst,
- (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)),
- sub_8bit_hi))>,
- Requires<[In64BitMode]>;
-def : Pat<(store (i8 (trunc_su (srl_su GR16:$src, (i8 8)))), addr:$dst),
- (MOV8mr_NOREX
- addr:$dst,
- (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
- sub_8bit_hi))>,
- Requires<[In64BitMode]>;
-
-
-// (shl x, 1) ==> (add x, x)
-// Note that if x is undef (immediate or otherwise), we could theoretically
-// end up with the two uses of x getting different values, producing a result
-// where the least significant bit is not 0. However, the probability of this
-// happening is considered low enough that this is officially not a
-// "real problem".
-def : Pat<(shl GR8 :$src1, (i8 1)), (ADD8rr GR8 :$src1, GR8 :$src1)>;
-def : Pat<(shl GR16:$src1, (i8 1)), (ADD16rr GR16:$src1, GR16:$src1)>;
-def : Pat<(shl GR32:$src1, (i8 1)), (ADD32rr GR32:$src1, GR32:$src1)>;
-def : Pat<(shl GR64:$src1, (i8 1)), (ADD64rr GR64:$src1, GR64:$src1)>;
-
-// Helper imms that check if a mask doesn't change significant shift bits.
-def immShift32 : ImmLeaf<i8, [{ return CountTrailingOnes_32(Imm) >= 5; }]>;
-def immShift64 : ImmLeaf<i8, [{ return CountTrailingOnes_32(Imm) >= 6; }]>;
-
-// Shift amount is implicitly masked.
-multiclass MaskedShiftAmountPats<SDNode frag, string name> {
- // (shift x (and y, 31)) ==> (shift x, y)
- def : Pat<(frag GR8:$src1, (and CL, immShift32)),
- (!cast<Instruction>(name # "8rCL") GR8:$src1)>;
- def : Pat<(frag GR16:$src1, (and CL, immShift32)),
- (!cast<Instruction>(name # "16rCL") GR16:$src1)>;
- def : Pat<(frag GR32:$src1, (and CL, immShift32)),
- (!cast<Instruction>(name # "32rCL") GR32:$src1)>;
- def : Pat<(store (frag (loadi8 addr:$dst), (and CL, immShift32)), addr:$dst),
- (!cast<Instruction>(name # "8mCL") addr:$dst)>;
- def : Pat<(store (frag (loadi16 addr:$dst), (and CL, immShift32)), addr:$dst),
- (!cast<Instruction>(name # "16mCL") addr:$dst)>;
- def : Pat<(store (frag (loadi32 addr:$dst), (and CL, immShift32)), addr:$dst),
- (!cast<Instruction>(name # "32mCL") addr:$dst)>;
-
- // (shift x (and y, 63)) ==> (shift x, y)
- def : Pat<(frag GR64:$src1, (and CL, immShift64)),
- (!cast<Instruction>(name # "64rCL") GR64:$src1)>;
- def : Pat<(store (frag (loadi64 addr:$dst), (and CL, 63)), addr:$dst),
- (!cast<Instruction>(name # "64mCL") addr:$dst)>;
-}
-
-defm : MaskedShiftAmountPats<shl, "SHL">;
-defm : MaskedShiftAmountPats<srl, "SHR">;
-defm : MaskedShiftAmountPats<sra, "SAR">;
-defm : MaskedShiftAmountPats<rotl, "ROL">;
-defm : MaskedShiftAmountPats<rotr, "ROR">;
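-// e.g. for
-//   unsigned f(unsigned x, unsigned n) { return x << (n & 31); }
-// the explicit "andl $31, ..." can be dropped because shll already masks the
-// count in %cl to 5 bits; a plausible selection is simply
-//   movl %esi, %ecx ; shll %cl, %edi ; movl %edi, %eax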
-
-// (anyext (setcc_carry)) -> (setcc_carry)
-def : Pat<(i16 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
- (SETB_C16r)>;
-def : Pat<(i32 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
- (SETB_C32r)>;
-def : Pat<(i32 (anyext (i16 (X86setcc_c X86_COND_B, EFLAGS)))),
- (SETB_C32r)>;
-
-
-
-
-//===----------------------------------------------------------------------===//
-// EFLAGS-defining Patterns
-//===----------------------------------------------------------------------===//
-
-// add reg, reg
-def : Pat<(add GR8 :$src1, GR8 :$src2), (ADD8rr GR8 :$src1, GR8 :$src2)>;
-def : Pat<(add GR16:$src1, GR16:$src2), (ADD16rr GR16:$src1, GR16:$src2)>;
-def : Pat<(add GR32:$src1, GR32:$src2), (ADD32rr GR32:$src1, GR32:$src2)>;
-
-// add reg, mem
-def : Pat<(add GR8:$src1, (loadi8 addr:$src2)),
- (ADD8rm GR8:$src1, addr:$src2)>;
-def : Pat<(add GR16:$src1, (loadi16 addr:$src2)),
- (ADD16rm GR16:$src1, addr:$src2)>;
-def : Pat<(add GR32:$src1, (loadi32 addr:$src2)),
- (ADD32rm GR32:$src1, addr:$src2)>;
-
-// add reg, imm
-def : Pat<(add GR8 :$src1, imm:$src2), (ADD8ri GR8:$src1 , imm:$src2)>;
-def : Pat<(add GR16:$src1, imm:$src2), (ADD16ri GR16:$src1, imm:$src2)>;
-def : Pat<(add GR32:$src1, imm:$src2), (ADD32ri GR32:$src1, imm:$src2)>;
-def : Pat<(add GR16:$src1, i16immSExt8:$src2),
- (ADD16ri8 GR16:$src1, i16immSExt8:$src2)>;
-def : Pat<(add GR32:$src1, i32immSExt8:$src2),
- (ADD32ri8 GR32:$src1, i32immSExt8:$src2)>;
-
-// sub reg, reg
-def : Pat<(sub GR8 :$src1, GR8 :$src2), (SUB8rr GR8 :$src1, GR8 :$src2)>;
-def : Pat<(sub GR16:$src1, GR16:$src2), (SUB16rr GR16:$src1, GR16:$src2)>;
-def : Pat<(sub GR32:$src1, GR32:$src2), (SUB32rr GR32:$src1, GR32:$src2)>;
-
-// sub reg, mem
-def : Pat<(sub GR8:$src1, (loadi8 addr:$src2)),
- (SUB8rm GR8:$src1, addr:$src2)>;
-def : Pat<(sub GR16:$src1, (loadi16 addr:$src2)),
- (SUB16rm GR16:$src1, addr:$src2)>;
-def : Pat<(sub GR32:$src1, (loadi32 addr:$src2)),
- (SUB32rm GR32:$src1, addr:$src2)>;
-
-// sub reg, imm
-def : Pat<(sub GR8:$src1, imm:$src2),
- (SUB8ri GR8:$src1, imm:$src2)>;
-def : Pat<(sub GR16:$src1, imm:$src2),
- (SUB16ri GR16:$src1, imm:$src2)>;
-def : Pat<(sub GR32:$src1, imm:$src2),
- (SUB32ri GR32:$src1, imm:$src2)>;
-def : Pat<(sub GR16:$src1, i16immSExt8:$src2),
- (SUB16ri8 GR16:$src1, i16immSExt8:$src2)>;
-def : Pat<(sub GR32:$src1, i32immSExt8:$src2),
- (SUB32ri8 GR32:$src1, i32immSExt8:$src2)>;
-
-// sub 0, reg
-def : Pat<(X86sub_flag 0, GR8 :$src), (NEG8r GR8 :$src)>;
-def : Pat<(X86sub_flag 0, GR16:$src), (NEG16r GR16:$src)>;
-def : Pat<(X86sub_flag 0, GR32:$src), (NEG32r GR32:$src)>;
-def : Pat<(X86sub_flag 0, GR64:$src), (NEG64r GR64:$src)>;
-
-// mul reg, reg
-def : Pat<(mul GR16:$src1, GR16:$src2),
- (IMUL16rr GR16:$src1, GR16:$src2)>;
-def : Pat<(mul GR32:$src1, GR32:$src2),
- (IMUL32rr GR32:$src1, GR32:$src2)>;
-
-// mul reg, mem
-def : Pat<(mul GR16:$src1, (loadi16 addr:$src2)),
- (IMUL16rm GR16:$src1, addr:$src2)>;
-def : Pat<(mul GR32:$src1, (loadi32 addr:$src2)),
- (IMUL32rm GR32:$src1, addr:$src2)>;
-
-// mul reg, imm
-def : Pat<(mul GR16:$src1, imm:$src2),
- (IMUL16rri GR16:$src1, imm:$src2)>;
-def : Pat<(mul GR32:$src1, imm:$src2),
- (IMUL32rri GR32:$src1, imm:$src2)>;
-def : Pat<(mul GR16:$src1, i16immSExt8:$src2),
- (IMUL16rri8 GR16:$src1, i16immSExt8:$src2)>;
-def : Pat<(mul GR32:$src1, i32immSExt8:$src2),
- (IMUL32rri8 GR32:$src1, i32immSExt8:$src2)>;
-
-// reg = mul mem, imm
-def : Pat<(mul (loadi16 addr:$src1), imm:$src2),
- (IMUL16rmi addr:$src1, imm:$src2)>;
-def : Pat<(mul (loadi32 addr:$src1), imm:$src2),
- (IMUL32rmi addr:$src1, imm:$src2)>;
-def : Pat<(mul (loadi16 addr:$src1), i16immSExt8:$src2),
- (IMUL16rmi8 addr:$src1, i16immSExt8:$src2)>;
-def : Pat<(mul (loadi32 addr:$src1), i32immSExt8:$src2),
- (IMUL32rmi8 addr:$src1, i32immSExt8:$src2)>;
-
-// Patterns for nodes that do not produce flags, for instructions that do.
-
-// addition
-def : Pat<(add GR64:$src1, GR64:$src2),
- (ADD64rr GR64:$src1, GR64:$src2)>;
-def : Pat<(add GR64:$src1, i64immSExt8:$src2),
- (ADD64ri8 GR64:$src1, i64immSExt8:$src2)>;
-def : Pat<(add GR64:$src1, i64immSExt32:$src2),
- (ADD64ri32 GR64:$src1, i64immSExt32:$src2)>;
-def : Pat<(add GR64:$src1, (loadi64 addr:$src2)),
- (ADD64rm GR64:$src1, addr:$src2)>;
-
-// subtraction
-def : Pat<(sub GR64:$src1, GR64:$src2),
- (SUB64rr GR64:$src1, GR64:$src2)>;
-def : Pat<(sub GR64:$src1, (loadi64 addr:$src2)),
- (SUB64rm GR64:$src1, addr:$src2)>;
-def : Pat<(sub GR64:$src1, i64immSExt8:$src2),
- (SUB64ri8 GR64:$src1, i64immSExt8:$src2)>;
-def : Pat<(sub GR64:$src1, i64immSExt32:$src2),
- (SUB64ri32 GR64:$src1, i64immSExt32:$src2)>;
-
-// Multiply
-def : Pat<(mul GR64:$src1, GR64:$src2),
- (IMUL64rr GR64:$src1, GR64:$src2)>;
-def : Pat<(mul GR64:$src1, (loadi64 addr:$src2)),
- (IMUL64rm GR64:$src1, addr:$src2)>;
-def : Pat<(mul GR64:$src1, i64immSExt8:$src2),
- (IMUL64rri8 GR64:$src1, i64immSExt8:$src2)>;
-def : Pat<(mul GR64:$src1, i64immSExt32:$src2),
- (IMUL64rri32 GR64:$src1, i64immSExt32:$src2)>;
-def : Pat<(mul (loadi64 addr:$src1), i64immSExt8:$src2),
- (IMUL64rmi8 addr:$src1, i64immSExt8:$src2)>;
-def : Pat<(mul (loadi64 addr:$src1), i64immSExt32:$src2),
- (IMUL64rmi32 addr:$src1, i64immSExt32:$src2)>;
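These patterns exist because a plain IR add/sub/mul produces no flag result, while every x86 ALU instruction defines EFLAGS; the flag output is simply left dead. A trivial C sketch of the common case (name invented):

/* Illustrative sketch: the "add" here has no flag users, but it still selects
 * ADD64rr; the EFLAGS def of that instruction is just dead. */
long sum(long a, long b) {
  return a + b;
}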
-
-// Increment/Decrement reg.
-// Do not use INC/DEC if they are slow on the target
-let Predicates = [NotSlowIncDec] in {
- def : Pat<(add GR8:$src, 1), (INC8r GR8:$src)>;
- def : Pat<(add GR16:$src, 1), (INC16r GR16:$src)>;
- def : Pat<(add GR32:$src, 1), (INC32r GR32:$src)>;
- def : Pat<(add GR64:$src, 1), (INC64r GR64:$src)>;
- def : Pat<(add GR8:$src, -1), (DEC8r GR8:$src)>;
- def : Pat<(add GR16:$src, -1), (DEC16r GR16:$src)>;
- def : Pat<(add GR32:$src, -1), (DEC32r GR32:$src)>;
- def : Pat<(add GR64:$src, -1), (DEC64r GR64:$src)>;
-}
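A small C sketch of what the NotSlowIncDec predicate decides (function name invented; the exact choice is a tuning matter):

/* Illustrative sketch: on most targets this selects "incl %edi"; targets that
 * set the SlowIncDec tuning feature get "addl $1, %edi" instead, since
 * INC/DEC leave CF untouched and that partial EFLAGS update can cost an
 * extra flag merge on some cores. */
unsigned bump(unsigned x) {
  return x + 1;
}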
-
-// or reg/reg.
-def : Pat<(or GR8 :$src1, GR8 :$src2), (OR8rr GR8 :$src1, GR8 :$src2)>;
-def : Pat<(or GR16:$src1, GR16:$src2), (OR16rr GR16:$src1, GR16:$src2)>;
-def : Pat<(or GR32:$src1, GR32:$src2), (OR32rr GR32:$src1, GR32:$src2)>;
-def : Pat<(or GR64:$src1, GR64:$src2), (OR64rr GR64:$src1, GR64:$src2)>;
-
-// or reg/mem
-def : Pat<(or GR8:$src1, (loadi8 addr:$src2)),
- (OR8rm GR8:$src1, addr:$src2)>;
-def : Pat<(or GR16:$src1, (loadi16 addr:$src2)),
- (OR16rm GR16:$src1, addr:$src2)>;
-def : Pat<(or GR32:$src1, (loadi32 addr:$src2)),
- (OR32rm GR32:$src1, addr:$src2)>;
-def : Pat<(or GR64:$src1, (loadi64 addr:$src2)),
- (OR64rm GR64:$src1, addr:$src2)>;
-
-// or reg/imm
-def : Pat<(or GR8:$src1 , imm:$src2), (OR8ri GR8 :$src1, imm:$src2)>;
-def : Pat<(or GR16:$src1, imm:$src2), (OR16ri GR16:$src1, imm:$src2)>;
-def : Pat<(or GR32:$src1, imm:$src2), (OR32ri GR32:$src1, imm:$src2)>;
-def : Pat<(or GR16:$src1, i16immSExt8:$src2),
- (OR16ri8 GR16:$src1, i16immSExt8:$src2)>;
-def : Pat<(or GR32:$src1, i32immSExt8:$src2),
- (OR32ri8 GR32:$src1, i32immSExt8:$src2)>;
-def : Pat<(or GR64:$src1, i64immSExt8:$src2),
- (OR64ri8 GR64:$src1, i64immSExt8:$src2)>;
-def : Pat<(or GR64:$src1, i64immSExt32:$src2),
- (OR64ri32 GR64:$src1, i64immSExt32:$src2)>;
-
-// xor reg/reg
-def : Pat<(xor GR8 :$src1, GR8 :$src2), (XOR8rr GR8 :$src1, GR8 :$src2)>;
-def : Pat<(xor GR16:$src1, GR16:$src2), (XOR16rr GR16:$src1, GR16:$src2)>;
-def : Pat<(xor GR32:$src1, GR32:$src2), (XOR32rr GR32:$src1, GR32:$src2)>;
-def : Pat<(xor GR64:$src1, GR64:$src2), (XOR64rr GR64:$src1, GR64:$src2)>;
-
-// xor reg/mem
-def : Pat<(xor GR8:$src1, (loadi8 addr:$src2)),
- (XOR8rm GR8:$src1, addr:$src2)>;
-def : Pat<(xor GR16:$src1, (loadi16 addr:$src2)),
- (XOR16rm GR16:$src1, addr:$src2)>;
-def : Pat<(xor GR32:$src1, (loadi32 addr:$src2)),
- (XOR32rm GR32:$src1, addr:$src2)>;
-def : Pat<(xor GR64:$src1, (loadi64 addr:$src2)),
- (XOR64rm GR64:$src1, addr:$src2)>;
-
-// xor reg/imm
-def : Pat<(xor GR8:$src1, imm:$src2),
- (XOR8ri GR8:$src1, imm:$src2)>;
-def : Pat<(xor GR16:$src1, imm:$src2),
- (XOR16ri GR16:$src1, imm:$src2)>;
-def : Pat<(xor GR32:$src1, imm:$src2),
- (XOR32ri GR32:$src1, imm:$src2)>;
-def : Pat<(xor GR16:$src1, i16immSExt8:$src2),
- (XOR16ri8 GR16:$src1, i16immSExt8:$src2)>;
-def : Pat<(xor GR32:$src1, i32immSExt8:$src2),
- (XOR32ri8 GR32:$src1, i32immSExt8:$src2)>;
-def : Pat<(xor GR64:$src1, i64immSExt8:$src2),
- (XOR64ri8 GR64:$src1, i64immSExt8:$src2)>;
-def : Pat<(xor GR64:$src1, i64immSExt32:$src2),
- (XOR64ri32 GR64:$src1, i64immSExt32:$src2)>;
-
-// and reg/reg
-def : Pat<(and GR8 :$src1, GR8 :$src2), (AND8rr GR8 :$src1, GR8 :$src2)>;
-def : Pat<(and GR16:$src1, GR16:$src2), (AND16rr GR16:$src1, GR16:$src2)>;
-def : Pat<(and GR32:$src1, GR32:$src2), (AND32rr GR32:$src1, GR32:$src2)>;
-def : Pat<(and GR64:$src1, GR64:$src2), (AND64rr GR64:$src1, GR64:$src2)>;
-
-// and reg/mem
-def : Pat<(and GR8:$src1, (loadi8 addr:$src2)),
- (AND8rm GR8:$src1, addr:$src2)>;
-def : Pat<(and GR16:$src1, (loadi16 addr:$src2)),
- (AND16rm GR16:$src1, addr:$src2)>;
-def : Pat<(and GR32:$src1, (loadi32 addr:$src2)),
- (AND32rm GR32:$src1, addr:$src2)>;
-def : Pat<(and GR64:$src1, (loadi64 addr:$src2)),
- (AND64rm GR64:$src1, addr:$src2)>;
-
-// and reg/imm
-def : Pat<(and GR8:$src1, imm:$src2),
- (AND8ri GR8:$src1, imm:$src2)>;
-def : Pat<(and GR16:$src1, imm:$src2),
- (AND16ri GR16:$src1, imm:$src2)>;
-def : Pat<(and GR32:$src1, imm:$src2),
- (AND32ri GR32:$src1, imm:$src2)>;
-def : Pat<(and GR16:$src1, i16immSExt8:$src2),
- (AND16ri8 GR16:$src1, i16immSExt8:$src2)>;
-def : Pat<(and GR32:$src1, i32immSExt8:$src2),
- (AND32ri8 GR32:$src1, i32immSExt8:$src2)>;
-def : Pat<(and GR64:$src1, i64immSExt8:$src2),
- (AND64ri8 GR64:$src1, i64immSExt8:$src2)>;
-def : Pat<(and GR64:$src1, i64immSExt32:$src2),
- (AND64ri32 GR64:$src1, i64immSExt32:$src2)>;
-
-// Bit scan instruction patterns to match explicit zero-undef behavior.
-def : Pat<(cttz_zero_undef GR16:$src), (BSF16rr GR16:$src)>;
-def : Pat<(cttz_zero_undef GR32:$src), (BSF32rr GR32:$src)>;
-def : Pat<(cttz_zero_undef GR64:$src), (BSF64rr GR64:$src)>;
-def : Pat<(cttz_zero_undef (loadi16 addr:$src)), (BSF16rm addr:$src)>;
-def : Pat<(cttz_zero_undef (loadi32 addr:$src)), (BSF32rm addr:$src)>;
-def : Pat<(cttz_zero_undef (loadi64 addr:$src)), (BSF64rm addr:$src)>;
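The zero-undef distinction corresponds directly to the GCC/Clang builtin; a hedged sketch (name invented):

/* Illustrative sketch: __builtin_ctz is undefined for a zero input, which is
 * exactly cttz_zero_undef, so the patterns above can use BSF directly without
 * guarding against x == 0 (BSF's destination is architecturally undefined in
 * that case anyway). */
unsigned trailing_zeros(unsigned x) {
  return (unsigned)__builtin_ctz(x);
}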
-
-// When HasMOVBE is enabled it is possible to get a non-legalized
-// register-register 16 bit bswap. This maps it to a ROL instruction.
-let Predicates = [HasMOVBE] in {
- def : Pat<(bswap GR16:$src), (ROL16ri GR16:$src, (i8 8))>;
-}
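A 16-bit byte swap is just an 8-bit rotate, which is all this pattern encodes; a minimal C sketch (name invented, and as the comment above notes this path only matters when MOVBE keeps the i16 bswap around until instruction selection):

#include <stdint.h>

/* Illustrative sketch: swapping the two bytes of a 16-bit value is expected
 * to come out as "rolw $8" on whatever register holds the argument. */
uint16_t swap16(uint16_t x) {
  return (uint16_t)__builtin_bswap16(x);
}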
+//===- X86InstrCompiler.td - Compiler Pseudos and Patterns -*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the various pseudo instructions used by the compiler, +// as well as Pat patterns used during instruction selection. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Pattern Matching Support + +def GetLo32XForm : SDNodeXForm<imm, [{ + // Transformation function: get the low 32 bits. + return getI32Imm((unsigned)N->getZExtValue()); +}]>; + +def GetLo8XForm : SDNodeXForm<imm, [{ + // Transformation function: get the low 8 bits. + return getI8Imm((uint8_t)N->getZExtValue()); +}]>; + + +//===----------------------------------------------------------------------===// +// Random Pseudo Instructions. + +// PIC base construction. This expands to code that looks like this: +// call $next_inst +// popl %destreg" +let hasSideEffects = 0, isNotDuplicable = 1, Uses = [ESP] in + def MOVPC32r : Ii32<0xE8, Pseudo, (outs GR32:$reg), (ins i32imm:$label), + "", []>; + + +// ADJCALLSTACKDOWN/UP implicitly use/def ESP because they may be expanded into +// a stack adjustment and the codegen must know that they may modify the stack +// pointer before prolog-epilog rewriting occurs. +// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become +// sub / add which can clobber EFLAGS. +let Defs = [ESP, EFLAGS], Uses = [ESP] in { +def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs), (ins i32imm:$amt), + "#ADJCALLSTACKDOWN", + [(X86callseq_start timm:$amt)]>, + Requires<[NotLP64]>; +def ADJCALLSTACKUP32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2), + "#ADJCALLSTACKUP", + [(X86callseq_end timm:$amt1, timm:$amt2)]>, + Requires<[NotLP64]>; +} + +// ADJCALLSTACKDOWN/UP implicitly use/def RSP because they may be expanded into +// a stack adjustment and the codegen must know that they may modify the stack +// pointer before prolog-epilog rewriting occurs. +// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become +// sub / add which can clobber EFLAGS. +let Defs = [RSP, EFLAGS], Uses = [RSP] in { +def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs), (ins i32imm:$amt), + "#ADJCALLSTACKDOWN", + [(X86callseq_start timm:$amt)]>, + Requires<[IsLP64]>; +def ADJCALLSTACKUP64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2), + "#ADJCALLSTACKUP", + [(X86callseq_end timm:$amt1, timm:$amt2)]>, + Requires<[IsLP64]>; +} + + + +// x86-64 va_start lowering magic. +let usesCustomInserter = 1, Defs = [EFLAGS] in { +def VASTART_SAVE_XMM_REGS : I<0, Pseudo, + (outs), + (ins GR8:$al, + i64imm:$regsavefi, i64imm:$offset, + variable_ops), + "#VASTART_SAVE_XMM_REGS $al, $regsavefi, $offset", + [(X86vastart_save_xmm_regs GR8:$al, + imm:$regsavefi, + imm:$offset), + (implicit EFLAGS)]>; + +// The VAARG_64 pseudo-instruction takes the address of the va_list, +// and places the address of the next argument into a register. 
+let Defs = [EFLAGS] in
+def VAARG_64 : I<0, Pseudo,
+ (outs GR64:$dst),
+ (ins i8mem:$ap, i32imm:$size, i8imm:$mode, i32imm:$align),
+ "#VAARG_64 $dst, $ap, $size, $mode, $align",
+ [(set GR64:$dst,
+ (X86vaarg64 addr:$ap, imm:$size, imm:$mode, imm:$align)),
+ (implicit EFLAGS)]>;
+
+// Dynamic stack allocation yields a _chkstk or _alloca call for all Windows
+// targets. These calls are needed to probe the stack when allocating more than
+// 4k bytes in one go. Touching the stack at 4K increments is necessary to
+// ensure that the guard pages used by the OS virtual memory manager are
+// allocated in correct sequence.
+// The main point of having separate instruction are extra unmodelled effects
+// (compared to ordinary calls) like stack pointer change.
+
+let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in
+ def WIN_ALLOCA : I<0, Pseudo, (outs), (ins),
+ "# dynamic stack allocation",
+ [(X86WinAlloca)]>;
+
+// When using segmented stacks these are lowered into instructions which first
+// check if the current stacklet has enough free memory. If it does, memory is
+// allocated by bumping the stack pointer. Otherwise memory is allocated from
+// the heap.
+
+let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in
+def SEG_ALLOCA_32 : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$size),
+ "# variable sized alloca for segmented stacks",
+ [(set GR32:$dst,
+ (X86SegAlloca GR32:$size))]>,
+ Requires<[NotLP64]>;
+
+let Defs = [RAX, RSP, EFLAGS], Uses = [RSP] in
+def SEG_ALLOCA_64 : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$size),
+ "# variable sized alloca for segmented stacks",
+ [(set GR64:$dst,
+ (X86SegAlloca GR64:$size))]>,
+ Requires<[In64BitMode]>;
+}
+
+// The MSVC runtime contains an _ftol2 routine for converting floating-point
+// to integer values. It has a strange calling convention: the input is
+// popped from the x87 stack, and the return value is given in EDX:EAX. ECX is
+// used as a temporary register. No other registers (aside from flags) are
+// touched.
+// Microsoft toolchains do not support 80-bit precision, so a WIN_FTOL_80
+// variant is unnecessary.
+ +let Defs = [EAX, EDX, ECX, EFLAGS], FPForm = SpecialFP in { + def WIN_FTOL_32 : I<0, Pseudo, (outs), (ins RFP32:$src), + "# win32 fptoui", + [(X86WinFTOL RFP32:$src)]>, + Requires<[Not64BitMode]>; + + def WIN_FTOL_64 : I<0, Pseudo, (outs), (ins RFP64:$src), + "# win32 fptoui", + [(X86WinFTOL RFP64:$src)]>, + Requires<[Not64BitMode]>; +} + +//===----------------------------------------------------------------------===// +// EH Pseudo Instructions +// +let SchedRW = [WriteSystem] in { +let isTerminator = 1, isReturn = 1, isBarrier = 1, + hasCtrlDep = 1, isCodeGenOnly = 1 in { +def EH_RETURN : I<0xC3, RawFrm, (outs), (ins GR32:$addr), + "ret\t#eh_return, addr: $addr", + [(X86ehret GR32:$addr)], IIC_RET>, Sched<[WriteJumpLd]>; + +} + +let isTerminator = 1, isReturn = 1, isBarrier = 1, + hasCtrlDep = 1, isCodeGenOnly = 1 in { +def EH_RETURN64 : I<0xC3, RawFrm, (outs), (ins GR64:$addr), + "ret\t#eh_return, addr: $addr", + [(X86ehret GR64:$addr)], IIC_RET>, Sched<[WriteJumpLd]>; + +} + +let hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1, + usesCustomInserter = 1 in { + def EH_SjLj_SetJmp32 : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$buf), + "#EH_SJLJ_SETJMP32", + [(set GR32:$dst, (X86eh_sjlj_setjmp addr:$buf))]>, + Requires<[Not64BitMode]>; + def EH_SjLj_SetJmp64 : I<0, Pseudo, (outs GR32:$dst), (ins i64mem:$buf), + "#EH_SJLJ_SETJMP64", + [(set GR32:$dst, (X86eh_sjlj_setjmp addr:$buf))]>, + Requires<[In64BitMode]>; + let isTerminator = 1 in { + def EH_SjLj_LongJmp32 : I<0, Pseudo, (outs), (ins i32mem:$buf), + "#EH_SJLJ_LONGJMP32", + [(X86eh_sjlj_longjmp addr:$buf)]>, + Requires<[Not64BitMode]>; + def EH_SjLj_LongJmp64 : I<0, Pseudo, (outs), (ins i64mem:$buf), + "#EH_SJLJ_LONGJMP64", + [(X86eh_sjlj_longjmp addr:$buf)]>, + Requires<[In64BitMode]>; + } +} +} // SchedRW + +let isBranch = 1, isTerminator = 1, isCodeGenOnly = 1 in { + def EH_SjLj_Setup : I<0, Pseudo, (outs), (ins brtarget:$dst), + "#EH_SjLj_Setup\t$dst", []>; +} + +//===----------------------------------------------------------------------===// +// Pseudo instructions used by unwind info. +// +let isPseudo = 1 in { + def SEH_PushReg : I<0, Pseudo, (outs), (ins i32imm:$reg), + "#SEH_PushReg $reg", []>; + def SEH_SaveReg : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$dst), + "#SEH_SaveReg $reg, $dst", []>; + def SEH_SaveXMM : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$dst), + "#SEH_SaveXMM $reg, $dst", []>; + def SEH_StackAlloc : I<0, Pseudo, (outs), (ins i32imm:$size), + "#SEH_StackAlloc $size", []>; + def SEH_SetFrame : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$offset), + "#SEH_SetFrame $reg, $offset", []>; + def SEH_PushFrame : I<0, Pseudo, (outs), (ins i1imm:$mode), + "#SEH_PushFrame $mode", []>; + def SEH_EndPrologue : I<0, Pseudo, (outs), (ins), + "#SEH_EndPrologue", []>; + def SEH_Epilogue : I<0, Pseudo, (outs), (ins), + "#SEH_Epilogue", []>; +} + +//===----------------------------------------------------------------------===// +// Pseudo instructions used by segmented stacks. +// + +// This is lowered into a RET instruction by MCInstLower. We need +// this so that we don't have to have a MachineBasicBlock which ends +// with a RET and also has successors. +let isPseudo = 1 in { +def MORESTACK_RET: I<0, Pseudo, (outs), (ins), + "", []>; + +// This instruction is lowered to a RET followed by a MOV. The two +// instructions are not generated on a higher level since then the +// verifier sees a MachineBasicBlock ending with a non-terminator. 
+def MORESTACK_RET_RESTORE_R10 : I<0, Pseudo, (outs), (ins), + "", []>; +} + +//===----------------------------------------------------------------------===// +// Alias Instructions +//===----------------------------------------------------------------------===// + +// Alias instruction mapping movr0 to xor. +// FIXME: remove when we can teach regalloc that xor reg, reg is ok. +let Defs = [EFLAGS], isReMaterializable = 1, isAsCheapAsAMove = 1, + isPseudo = 1 in +def MOV32r0 : I<0, Pseudo, (outs GR32:$dst), (ins), "", + [(set GR32:$dst, 0)], IIC_ALU_NONMEM>, Sched<[WriteZero]>; + +// Other widths can also make use of the 32-bit xor, which may have a smaller +// encoding and avoid partial register updates. +def : Pat<(i8 0), (EXTRACT_SUBREG (MOV32r0), sub_8bit)>; +def : Pat<(i16 0), (EXTRACT_SUBREG (MOV32r0), sub_16bit)>; +def : Pat<(i64 0), (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit)> { + let AddedComplexity = 20; +} + +// Materialize i64 constant where top 32-bits are zero. This could theoretically +// use MOV32ri with a SUBREG_TO_REG to represent the zero-extension, however +// that would make it more difficult to rematerialize. +let AddedComplexity = 1, isReMaterializable = 1, isAsCheapAsAMove = 1, + isCodeGenOnly = 1, hasSideEffects = 0 in +def MOV32ri64 : Ii32<0xb8, AddRegFrm, (outs GR32:$dst), (ins i64i32imm:$src), + "", [], IIC_ALU_NONMEM>, Sched<[WriteALU]>; + +// This 64-bit pseudo-move can be used for both a 64-bit constant that is +// actually the zero-extension of a 32-bit constant, and for labels in the +// x86-64 small code model. +def mov64imm32 : ComplexPattern<i64, 1, "SelectMOV64Imm32", [imm, X86Wrapper]>; + +let AddedComplexity = 1 in +def : Pat<(i64 mov64imm32:$src), + (SUBREG_TO_REG (i64 0), (MOV32ri64 mov64imm32:$src), sub_32bit)>; + +// Use sbb to materialize carry bit. +let Uses = [EFLAGS], Defs = [EFLAGS], isPseudo = 1, SchedRW = [WriteALU] in { +// FIXME: These are pseudo ops that should be replaced with Pat<> patterns. +// However, Pat<> can't replicate the destination reg into the inputs of the +// result. +def SETB_C8r : I<0, Pseudo, (outs GR8:$dst), (ins), "", + [(set GR8:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>; +def SETB_C16r : I<0, Pseudo, (outs GR16:$dst), (ins), "", + [(set GR16:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>; +def SETB_C32r : I<0, Pseudo, (outs GR32:$dst), (ins), "", + [(set GR32:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>; +def SETB_C64r : I<0, Pseudo, (outs GR64:$dst), (ins), "", + [(set GR64:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>; +} // isCodeGenOnly + + +def : Pat<(i16 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), + (SETB_C16r)>; +def : Pat<(i32 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), + (SETB_C32r)>; +def : Pat<(i64 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), + (SETB_C64r)>; + +def : Pat<(i16 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), + (SETB_C16r)>; +def : Pat<(i32 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), + (SETB_C32r)>; +def : Pat<(i64 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), + (SETB_C64r)>; + +// We canonicalize 'setb' to "(and (sbb reg,reg), 1)" on the hope that the and +// will be eliminated and that the sbb can be extended up to a wider type. When +// this happens, it is great. However, if we are left with an 8-bit sbb and an +// and, we might as well just match it as a setb. 
+def : Pat<(and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1), + (SETBr)>; + +// (add OP, SETB) -> (adc OP, 0) +def : Pat<(add (and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR8:$op), + (ADC8ri GR8:$op, 0)>; +def : Pat<(add (and (i32 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR32:$op), + (ADC32ri8 GR32:$op, 0)>; +def : Pat<(add (and (i64 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR64:$op), + (ADC64ri8 GR64:$op, 0)>; + +// (sub OP, SETB) -> (sbb OP, 0) +def : Pat<(sub GR8:$op, (and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1)), + (SBB8ri GR8:$op, 0)>; +def : Pat<(sub GR32:$op, (and (i32 (X86setcc_c X86_COND_B, EFLAGS)), 1)), + (SBB32ri8 GR32:$op, 0)>; +def : Pat<(sub GR64:$op, (and (i64 (X86setcc_c X86_COND_B, EFLAGS)), 1)), + (SBB64ri8 GR64:$op, 0)>; + +// (sub OP, SETCC_CARRY) -> (adc OP, 0) +def : Pat<(sub GR8:$op, (i8 (X86setcc_c X86_COND_B, EFLAGS))), + (ADC8ri GR8:$op, 0)>; +def : Pat<(sub GR32:$op, (i32 (X86setcc_c X86_COND_B, EFLAGS))), + (ADC32ri8 GR32:$op, 0)>; +def : Pat<(sub GR64:$op, (i64 (X86setcc_c X86_COND_B, EFLAGS))), + (ADC64ri8 GR64:$op, 0)>; + +//===----------------------------------------------------------------------===// +// String Pseudo Instructions +// +let SchedRW = [WriteMicrocoded] in { +let Defs = [ECX,EDI,ESI], Uses = [ECX,EDI,ESI], isCodeGenOnly = 1 in { +def REP_MOVSB_32 : I<0xA4, RawFrm, (outs), (ins), "{rep;movsb|rep movsb}", + [(X86rep_movs i8)], IIC_REP_MOVS>, REP, + Requires<[Not64BitMode]>; +def REP_MOVSW_32 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsw|rep movsw}", + [(X86rep_movs i16)], IIC_REP_MOVS>, REP, OpSize16, + Requires<[Not64BitMode]>; +def REP_MOVSD_32 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsl|rep movsd}", + [(X86rep_movs i32)], IIC_REP_MOVS>, REP, OpSize32, + Requires<[Not64BitMode]>; +} + +let Defs = [RCX,RDI,RSI], Uses = [RCX,RDI,RSI], isCodeGenOnly = 1 in { +def REP_MOVSB_64 : I<0xA4, RawFrm, (outs), (ins), "{rep;movsb|rep movsb}", + [(X86rep_movs i8)], IIC_REP_MOVS>, REP, + Requires<[In64BitMode]>; +def REP_MOVSW_64 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsw|rep movsw}", + [(X86rep_movs i16)], IIC_REP_MOVS>, REP, OpSize16, + Requires<[In64BitMode]>; +def REP_MOVSD_64 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsl|rep movsd}", + [(X86rep_movs i32)], IIC_REP_MOVS>, REP, OpSize32, + Requires<[In64BitMode]>; +def REP_MOVSQ_64 : RI<0xA5, RawFrm, (outs), (ins), "{rep;movsq|rep movsq}", + [(X86rep_movs i64)], IIC_REP_MOVS>, REP, + Requires<[In64BitMode]>; +} + +// FIXME: Should use "(X86rep_stos AL)" as the pattern. 
+let Defs = [ECX,EDI], isCodeGenOnly = 1 in { + let Uses = [AL,ECX,EDI] in + def REP_STOSB_32 : I<0xAA, RawFrm, (outs), (ins), "{rep;stosb|rep stosb}", + [(X86rep_stos i8)], IIC_REP_STOS>, REP, + Requires<[Not64BitMode]>; + let Uses = [AX,ECX,EDI] in + def REP_STOSW_32 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosw|rep stosw}", + [(X86rep_stos i16)], IIC_REP_STOS>, REP, OpSize16, + Requires<[Not64BitMode]>; + let Uses = [EAX,ECX,EDI] in + def REP_STOSD_32 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosl|rep stosd}", + [(X86rep_stos i32)], IIC_REP_STOS>, REP, OpSize32, + Requires<[Not64BitMode]>; +} + +let Defs = [RCX,RDI], isCodeGenOnly = 1 in { + let Uses = [AL,RCX,RDI] in + def REP_STOSB_64 : I<0xAA, RawFrm, (outs), (ins), "{rep;stosb|rep stosb}", + [(X86rep_stos i8)], IIC_REP_STOS>, REP, + Requires<[In64BitMode]>; + let Uses = [AX,RCX,RDI] in + def REP_STOSW_64 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosw|rep stosw}", + [(X86rep_stos i16)], IIC_REP_STOS>, REP, OpSize16, + Requires<[In64BitMode]>; + let Uses = [RAX,RCX,RDI] in + def REP_STOSD_64 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosl|rep stosd}", + [(X86rep_stos i32)], IIC_REP_STOS>, REP, OpSize32, + Requires<[In64BitMode]>; + + let Uses = [RAX,RCX,RDI] in + def REP_STOSQ_64 : RI<0xAB, RawFrm, (outs), (ins), "{rep;stosq|rep stosq}", + [(X86rep_stos i64)], IIC_REP_STOS>, REP, + Requires<[In64BitMode]>; +} +} // SchedRW + +//===----------------------------------------------------------------------===// +// Thread Local Storage Instructions +// + +// ELF TLS Support +// All calls clobber the non-callee saved registers. ESP is marked as +// a use to prevent stack-pointer assignments that appear immediately +// before calls from potentially appearing dead. +let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, FP7, + ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7, + MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, + XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, + XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS], + Uses = [ESP] in { +def TLS_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym), + "# TLS_addr32", + [(X86tlsaddr tls32addr:$sym)]>, + Requires<[Not64BitMode]>; +def TLS_base_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym), + "# TLS_base_addr32", + [(X86tlsbaseaddr tls32baseaddr:$sym)]>, + Requires<[Not64BitMode]>; +} + +// All calls clobber the non-callee saved registers. RSP is marked as +// a use to prevent stack-pointer assignments that appear immediately +// before calls from potentially appearing dead. +let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11, + FP0, FP1, FP2, FP3, FP4, FP5, FP6, FP7, + ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7, + MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, + XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, + XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS], + Uses = [RSP] in { +def TLS_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym), + "# TLS_addr64", + [(X86tlsaddr tls64addr:$sym)]>, + Requires<[In64BitMode]>; +def TLS_base_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym), + "# TLS_base_addr64", + [(X86tlsbaseaddr tls64baseaddr:$sym)]>, + Requires<[In64BitMode]>; +} + +// Darwin TLS Support +// For i386, the address of the thunk is passed on the stack, on return the +// address of the variable is in %eax. %ecx is trashed during the function +// call. All other registers are preserved. 
+let Defs = [EAX, ECX, EFLAGS], + Uses = [ESP], + usesCustomInserter = 1 in +def TLSCall_32 : I<0, Pseudo, (outs), (ins i32mem:$sym), + "# TLSCall_32", + [(X86TLSCall addr:$sym)]>, + Requires<[Not64BitMode]>; + +// For x86_64, the address of the thunk is passed in %rdi, on return +// the address of the variable is in %rax. All other registers are preserved. +let Defs = [RAX, EFLAGS], + Uses = [RSP, RDI], + usesCustomInserter = 1 in +def TLSCall_64 : I<0, Pseudo, (outs), (ins i64mem:$sym), + "# TLSCall_64", + [(X86TLSCall addr:$sym)]>, + Requires<[In64BitMode]>; + + +//===----------------------------------------------------------------------===// +// Conditional Move Pseudo Instructions + +// X86 doesn't have 8-bit conditional moves. Use a customInserter to +// emit control flow. An alternative to this is to mark i8 SELECT as Promote, +// however that requires promoting the operands, and can induce additional +// i8 register pressure. +let usesCustomInserter = 1, Uses = [EFLAGS] in { +def CMOV_GR8 : I<0, Pseudo, + (outs GR8:$dst), (ins GR8:$src1, GR8:$src2, i8imm:$cond), + "#CMOV_GR8 PSEUDO!", + [(set GR8:$dst, (X86cmov GR8:$src1, GR8:$src2, + imm:$cond, EFLAGS))]>; + +let Predicates = [NoCMov] in { +def CMOV_GR32 : I<0, Pseudo, + (outs GR32:$dst), (ins GR32:$src1, GR32:$src2, i8imm:$cond), + "#CMOV_GR32* PSEUDO!", + [(set GR32:$dst, + (X86cmov GR32:$src1, GR32:$src2, imm:$cond, EFLAGS))]>; +def CMOV_GR16 : I<0, Pseudo, + (outs GR16:$dst), (ins GR16:$src1, GR16:$src2, i8imm:$cond), + "#CMOV_GR16* PSEUDO!", + [(set GR16:$dst, + (X86cmov GR16:$src1, GR16:$src2, imm:$cond, EFLAGS))]>; +} // Predicates = [NoCMov] + +// fcmov doesn't handle all possible EFLAGS, provide a fallback if there is no +// SSE1. +let Predicates = [FPStackf32] in +def CMOV_RFP32 : I<0, Pseudo, + (outs RFP32:$dst), + (ins RFP32:$src1, RFP32:$src2, i8imm:$cond), + "#CMOV_RFP32 PSEUDO!", + [(set RFP32:$dst, + (X86cmov RFP32:$src1, RFP32:$src2, imm:$cond, + EFLAGS))]>; +// fcmov doesn't handle all possible EFLAGS, provide a fallback if there is no +// SSE2. +let Predicates = [FPStackf64] in +def CMOV_RFP64 : I<0, Pseudo, + (outs RFP64:$dst), + (ins RFP64:$src1, RFP64:$src2, i8imm:$cond), + "#CMOV_RFP64 PSEUDO!", + [(set RFP64:$dst, + (X86cmov RFP64:$src1, RFP64:$src2, imm:$cond, + EFLAGS))]>; +def CMOV_RFP80 : I<0, Pseudo, + (outs RFP80:$dst), + (ins RFP80:$src1, RFP80:$src2, i8imm:$cond), + "#CMOV_RFP80 PSEUDO!", + [(set RFP80:$dst, + (X86cmov RFP80:$src1, RFP80:$src2, imm:$cond, + EFLAGS))]>; +} // UsesCustomInserter = 1, Uses = [EFLAGS] + + +//===----------------------------------------------------------------------===// +// Normal-Instructions-With-Lock-Prefix Pseudo Instructions +//===----------------------------------------------------------------------===// + +// FIXME: Use normal instructions and add lock prefix dynamically. + +// Memory barriers + +// TODO: Get this to fold the constant into the instruction. 
+let isCodeGenOnly = 1, Defs = [EFLAGS] in +def OR32mrLocked : I<0x09, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$zero), + "or{l}\t{$zero, $dst|$dst, $zero}", + [], IIC_ALU_MEM>, Requires<[Not64BitMode]>, LOCK, + Sched<[WriteALULd, WriteRMW]>; + +let hasSideEffects = 1 in +def Int_MemBarrier : I<0, Pseudo, (outs), (ins), + "#MEMBARRIER", + [(X86MemBarrier)]>, Sched<[WriteLoad]>; + +// RegOpc corresponds to the mr version of the instruction +// ImmOpc corresponds to the mi version of the instruction +// ImmOpc8 corresponds to the mi8 version of the instruction +// ImmMod corresponds to the instruction format of the mi and mi8 versions +multiclass LOCK_ArithBinOp<bits<8> RegOpc, bits<8> ImmOpc, bits<8> ImmOpc8, + Format ImmMod, string mnemonic> { +let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1, + SchedRW = [WriteALULd, WriteRMW] in { + +def NAME#8mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4}, + RegOpc{3}, RegOpc{2}, RegOpc{1}, 0 }, + MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src2), + !strconcat(mnemonic, "{b}\t", + "{$src2, $dst|$dst, $src2}"), + [], IIC_ALU_NONMEM>, LOCK; +def NAME#16mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4}, + RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 }, + MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2), + !strconcat(mnemonic, "{w}\t", + "{$src2, $dst|$dst, $src2}"), + [], IIC_ALU_NONMEM>, OpSize16, LOCK; +def NAME#32mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4}, + RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 }, + MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2), + !strconcat(mnemonic, "{l}\t", + "{$src2, $dst|$dst, $src2}"), + [], IIC_ALU_NONMEM>, OpSize32, LOCK; +def NAME#64mr : RI<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4}, + RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 }, + MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2), + !strconcat(mnemonic, "{q}\t", + "{$src2, $dst|$dst, $src2}"), + [], IIC_ALU_NONMEM>, LOCK; + +def NAME#8mi : Ii8<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, + ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 0 }, + ImmMod, (outs), (ins i8mem :$dst, i8imm :$src2), + !strconcat(mnemonic, "{b}\t", + "{$src2, $dst|$dst, $src2}"), + [], IIC_ALU_MEM>, LOCK; + +def NAME#16mi : Ii16<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, + ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 }, + ImmMod, (outs), (ins i16mem :$dst, i16imm :$src2), + !strconcat(mnemonic, "{w}\t", + "{$src2, $dst|$dst, $src2}"), + [], IIC_ALU_MEM>, OpSize16, LOCK; + +def NAME#32mi : Ii32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, + ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 }, + ImmMod, (outs), (ins i32mem :$dst, i32imm :$src2), + !strconcat(mnemonic, "{l}\t", + "{$src2, $dst|$dst, $src2}"), + [], IIC_ALU_MEM>, OpSize32, LOCK; + +def NAME#64mi32 : RIi32S<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, + ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 }, + ImmMod, (outs), (ins i64mem :$dst, i64i32imm :$src2), + !strconcat(mnemonic, "{q}\t", + "{$src2, $dst|$dst, $src2}"), + [], IIC_ALU_MEM>, LOCK; + +def NAME#16mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4}, + ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 }, + ImmMod, (outs), (ins i16mem :$dst, i16i8imm :$src2), + !strconcat(mnemonic, "{w}\t", + "{$src2, $dst|$dst, $src2}"), + [], IIC_ALU_MEM>, OpSize16, LOCK; +def NAME#32mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4}, + ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 }, + ImmMod, (outs), (ins i32mem :$dst, i32i8imm :$src2), + !strconcat(mnemonic, "{l}\t", + "{$src2, $dst|$dst, $src2}"), + [], IIC_ALU_MEM>, OpSize32, LOCK; +def NAME#64mi8 : RIi8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4}, + ImmOpc8{3}, ImmOpc8{2}, 
ImmOpc8{1}, 1 }, + ImmMod, (outs), (ins i64mem :$dst, i64i8imm :$src2), + !strconcat(mnemonic, "{q}\t", + "{$src2, $dst|$dst, $src2}"), + [], IIC_ALU_MEM>, LOCK; + +} + +} + +defm LOCK_ADD : LOCK_ArithBinOp<0x00, 0x80, 0x83, MRM0m, "add">; +defm LOCK_SUB : LOCK_ArithBinOp<0x28, 0x80, 0x83, MRM5m, "sub">; +defm LOCK_OR : LOCK_ArithBinOp<0x08, 0x80, 0x83, MRM1m, "or">; +defm LOCK_AND : LOCK_ArithBinOp<0x20, 0x80, 0x83, MRM4m, "and">; +defm LOCK_XOR : LOCK_ArithBinOp<0x30, 0x80, 0x83, MRM6m, "xor">; + +// Optimized codegen when the non-memory output is not used. +multiclass LOCK_ArithUnOp<bits<8> Opc8, bits<8> Opc, Format Form, + string mnemonic> { +let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1, + SchedRW = [WriteALULd, WriteRMW] in { + +def NAME#8m : I<Opc8, Form, (outs), (ins i8mem :$dst), + !strconcat(mnemonic, "{b}\t$dst"), + [], IIC_UNARY_MEM>, LOCK; +def NAME#16m : I<Opc, Form, (outs), (ins i16mem:$dst), + !strconcat(mnemonic, "{w}\t$dst"), + [], IIC_UNARY_MEM>, OpSize16, LOCK; +def NAME#32m : I<Opc, Form, (outs), (ins i32mem:$dst), + !strconcat(mnemonic, "{l}\t$dst"), + [], IIC_UNARY_MEM>, OpSize32, LOCK; +def NAME#64m : RI<Opc, Form, (outs), (ins i64mem:$dst), + !strconcat(mnemonic, "{q}\t$dst"), + [], IIC_UNARY_MEM>, LOCK; +} +} + +defm LOCK_INC : LOCK_ArithUnOp<0xFE, 0xFF, MRM0m, "inc">; +defm LOCK_DEC : LOCK_ArithUnOp<0xFE, 0xFF, MRM1m, "dec">; + +// Atomic compare and swap. +multiclass LCMPXCHG_UnOp<bits<8> Opc, Format Form, string mnemonic, + SDPatternOperator frag, X86MemOperand x86memop, + InstrItinClass itin> { +let isCodeGenOnly = 1 in { + def NAME : I<Opc, Form, (outs), (ins x86memop:$ptr), + !strconcat(mnemonic, "\t$ptr"), + [(frag addr:$ptr)], itin>, TB, LOCK; +} +} + +multiclass LCMPXCHG_BinOp<bits<8> Opc8, bits<8> Opc, Format Form, + string mnemonic, SDPatternOperator frag, + InstrItinClass itin8, InstrItinClass itin> { +let isCodeGenOnly = 1, SchedRW = [WriteALULd, WriteRMW] in { + let Defs = [AL, EFLAGS], Uses = [AL] in + def NAME#8 : I<Opc8, Form, (outs), (ins i8mem:$ptr, GR8:$swap), + !strconcat(mnemonic, "{b}\t{$swap, $ptr|$ptr, $swap}"), + [(frag addr:$ptr, GR8:$swap, 1)], itin8>, TB, LOCK; + let Defs = [AX, EFLAGS], Uses = [AX] in + def NAME#16 : I<Opc, Form, (outs), (ins i16mem:$ptr, GR16:$swap), + !strconcat(mnemonic, "{w}\t{$swap, $ptr|$ptr, $swap}"), + [(frag addr:$ptr, GR16:$swap, 2)], itin>, TB, OpSize16, LOCK; + let Defs = [EAX, EFLAGS], Uses = [EAX] in + def NAME#32 : I<Opc, Form, (outs), (ins i32mem:$ptr, GR32:$swap), + !strconcat(mnemonic, "{l}\t{$swap, $ptr|$ptr, $swap}"), + [(frag addr:$ptr, GR32:$swap, 4)], itin>, TB, OpSize32, LOCK; + let Defs = [RAX, EFLAGS], Uses = [RAX] in + def NAME#64 : RI<Opc, Form, (outs), (ins i64mem:$ptr, GR64:$swap), + !strconcat(mnemonic, "{q}\t{$swap, $ptr|$ptr, $swap}"), + [(frag addr:$ptr, GR64:$swap, 8)], itin>, TB, LOCK; +} +} + +let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX], + SchedRW = [WriteALULd, WriteRMW] in { +defm LCMPXCHG8B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg8b", + X86cas8, i64mem, + IIC_CMPX_LOCK_8B>; +} + +let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX], + Predicates = [HasCmpxchg16b], SchedRW = [WriteALULd, WriteRMW] in { +defm LCMPXCHG16B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg16b", + X86cas16, i128mem, + IIC_CMPX_LOCK_16B>, REX_W; +} + +defm LCMPXCHG : LCMPXCHG_BinOp<0xB0, 0xB1, MRMDestMem, "cmpxchg", + X86cas, IIC_CMPX_LOCK_8, IIC_CMPX_LOCK>; + +// Atomic exchange and add +multiclass ATOMIC_LOAD_BINOP<bits<8> opc8, bits<8> opc, string mnemonic, + string frag, + 
InstrItinClass itin8, InstrItinClass itin> { + let Constraints = "$val = $dst", Defs = [EFLAGS], isCodeGenOnly = 1, + SchedRW = [WriteALULd, WriteRMW] in { + def NAME#8 : I<opc8, MRMSrcMem, (outs GR8:$dst), + (ins GR8:$val, i8mem:$ptr), + !strconcat(mnemonic, "{b}\t{$val, $ptr|$ptr, $val}"), + [(set GR8:$dst, + (!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val))], + itin8>; + def NAME#16 : I<opc, MRMSrcMem, (outs GR16:$dst), + (ins GR16:$val, i16mem:$ptr), + !strconcat(mnemonic, "{w}\t{$val, $ptr|$ptr, $val}"), + [(set + GR16:$dst, + (!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val))], + itin>, OpSize16; + def NAME#32 : I<opc, MRMSrcMem, (outs GR32:$dst), + (ins GR32:$val, i32mem:$ptr), + !strconcat(mnemonic, "{l}\t{$val, $ptr|$ptr, $val}"), + [(set + GR32:$dst, + (!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val))], + itin>, OpSize32; + def NAME#64 : RI<opc, MRMSrcMem, (outs GR64:$dst), + (ins GR64:$val, i64mem:$ptr), + !strconcat(mnemonic, "{q}\t{$val, $ptr|$ptr, $val}"), + [(set + GR64:$dst, + (!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val))], + itin>; + } +} + +defm LXADD : ATOMIC_LOAD_BINOP<0xc0, 0xc1, "xadd", "atomic_load_add", + IIC_XADD_LOCK_MEM8, IIC_XADD_LOCK_MEM>, + TB, LOCK; + +/* The following multiclass tries to make sure that in code like + * x.store (immediate op x.load(acquire), release) + * an operation directly on memory is generated instead of wasting a register. + * It is not automatic as atomic_store/load are only lowered to MOV instructions + * extremely late to prevent them from being accidentally reordered in the backend + * (see below the RELEASE_MOV* / ACQUIRE_MOV* pseudo-instructions) + */ +multiclass RELEASE_BINOP_MI<string op> { + def NAME#8mi : I<0, Pseudo, (outs), (ins i8mem:$dst, i8imm:$src), + "#RELEASE_BINOP PSEUDO!", + [(atomic_store_8 addr:$dst, (!cast<PatFrag>(op) + (atomic_load_8 addr:$dst), (i8 imm:$src)))]>; + // NAME#16 is not generated as 16-bit arithmetic instructions are considered + // costly and avoided as far as possible by this backend anyway + def NAME#32mi : I<0, Pseudo, (outs), (ins i32mem:$dst, i32imm:$src), + "#RELEASE_BINOP PSEUDO!", + [(atomic_store_32 addr:$dst, (!cast<PatFrag>(op) + (atomic_load_32 addr:$dst), (i32 imm:$src)))]>; + def NAME#64mi32 : I<0, Pseudo, (outs), (ins i64mem:$dst, i64i32imm:$src), + "#RELEASE_BINOP PSEUDO!", + [(atomic_store_64 addr:$dst, (!cast<PatFrag>(op) + (atomic_load_64 addr:$dst), (i64immSExt32:$src)))]>; +} +defm RELEASE_ADD : RELEASE_BINOP_MI<"add">; +defm RELEASE_AND : RELEASE_BINOP_MI<"and">; +defm RELEASE_OR : RELEASE_BINOP_MI<"or">; +defm RELEASE_XOR : RELEASE_BINOP_MI<"xor">; +// Note: we don't deal with sub, because substractions of constants are +// optimized into additions before this code can run + +multiclass RELEASE_UNOP<dag dag8, dag dag16, dag dag32, dag dag64> { + def NAME#8m : I<0, Pseudo, (outs), (ins i8mem:$dst), + "#RELEASE_UNOP PSEUDO!", + [(atomic_store_8 addr:$dst, dag8)]>; + def NAME#16m : I<0, Pseudo, (outs), (ins i16mem:$dst), + "#RELEASE_UNOP PSEUDO!", + [(atomic_store_16 addr:$dst, dag16)]>; + def NAME#32m : I<0, Pseudo, (outs), (ins i32mem:$dst), + "#RELEASE_UNOP PSEUDO!", + [(atomic_store_32 addr:$dst, dag32)]>; + def NAME#64m : I<0, Pseudo, (outs), (ins i64mem:$dst), + "#RELEASE_UNOP PSEUDO!", + [(atomic_store_64 addr:$dst, dag64)]>; +} + +defm RELEASE_INC : RELEASE_UNOP< + (add (atomic_load_8 addr:$dst), (i8 1)), + (add (atomic_load_16 addr:$dst), (i16 1)), + (add (atomic_load_32 addr:$dst), (i32 1)), + (add (atomic_load_64 addr:$dst), (i64 1))>, 
Requires<[NotSlowIncDec]>; +defm RELEASE_DEC : RELEASE_UNOP< + (add (atomic_load_8 addr:$dst), (i8 -1)), + (add (atomic_load_16 addr:$dst), (i16 -1)), + (add (atomic_load_32 addr:$dst), (i32 -1)), + (add (atomic_load_64 addr:$dst), (i64 -1))>, Requires<[NotSlowIncDec]>; +/* +TODO: These don't work because the type inference of TableGen fails. +TODO: find a way to fix it. +defm RELEASE_NEG : RELEASE_UNOP< + (ineg (atomic_load_8 addr:$dst)), + (ineg (atomic_load_16 addr:$dst)), + (ineg (atomic_load_32 addr:$dst)), + (ineg (atomic_load_64 addr:$dst))>; +defm RELEASE_NOT : RELEASE_UNOP< + (not (atomic_load_8 addr:$dst)), + (not (atomic_load_16 addr:$dst)), + (not (atomic_load_32 addr:$dst)), + (not (atomic_load_64 addr:$dst))>; +*/ + +def RELEASE_MOV8mi : I<0, Pseudo, (outs), (ins i8mem:$dst, i8imm:$src), + "#RELEASE_MOV PSEUDO !", + [(atomic_store_8 addr:$dst, (i8 imm:$src))]>; +def RELEASE_MOV16mi : I<0, Pseudo, (outs), (ins i16mem:$dst, i16imm:$src), + "#RELEASE_MOV PSEUDO !", + [(atomic_store_16 addr:$dst, (i16 imm:$src))]>; +def RELEASE_MOV32mi : I<0, Pseudo, (outs), (ins i32mem:$dst, i32imm:$src), + "#RELEASE_MOV PSEUDO !", + [(atomic_store_32 addr:$dst, (i32 imm:$src))]>; +def RELEASE_MOV64mi32 : I<0, Pseudo, (outs), (ins i64mem:$dst, i64i32imm:$src), + "#RELEASE_MOV PSEUDO !", + [(atomic_store_64 addr:$dst, i64immSExt32:$src)]>; + +def RELEASE_MOV8mr : I<0, Pseudo, (outs), (ins i8mem :$dst, GR8 :$src), + "#RELEASE_MOV PSEUDO!", + [(atomic_store_8 addr:$dst, GR8 :$src)]>; +def RELEASE_MOV16mr : I<0, Pseudo, (outs), (ins i16mem:$dst, GR16:$src), + "#RELEASE_MOV PSEUDO!", + [(atomic_store_16 addr:$dst, GR16:$src)]>; +def RELEASE_MOV32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, GR32:$src), + "#RELEASE_MOV PSEUDO!", + [(atomic_store_32 addr:$dst, GR32:$src)]>; +def RELEASE_MOV64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, GR64:$src), + "#RELEASE_MOV PSEUDO!", + [(atomic_store_64 addr:$dst, GR64:$src)]>; + +def ACQUIRE_MOV8rm : I<0, Pseudo, (outs GR8 :$dst), (ins i8mem :$src), + "#ACQUIRE_MOV PSEUDO!", + [(set GR8:$dst, (atomic_load_8 addr:$src))]>; +def ACQUIRE_MOV16rm : I<0, Pseudo, (outs GR16:$dst), (ins i16mem:$src), + "#ACQUIRE_MOV PSEUDO!", + [(set GR16:$dst, (atomic_load_16 addr:$src))]>; +def ACQUIRE_MOV32rm : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$src), + "#ACQUIRE_MOV PSEUDO!", + [(set GR32:$dst, (atomic_load_32 addr:$src))]>; +def ACQUIRE_MOV64rm : I<0, Pseudo, (outs GR64:$dst), (ins i64mem:$src), + "#ACQUIRE_MOV PSEUDO!", + [(set GR64:$dst, (atomic_load_64 addr:$src))]>; +//===----------------------------------------------------------------------===// +// Conditional Move Pseudo Instructions. +//===----------------------------------------------------------------------===// + +// CMOV* - Used to implement the SSE SELECT DAG operation. Expanded after +// instruction selection into a branch sequence. 
+let Uses = [EFLAGS], usesCustomInserter = 1 in { + def CMOV_FR32 : I<0, Pseudo, + (outs FR32:$dst), (ins FR32:$t, FR32:$f, i8imm:$cond), + "#CMOV_FR32 PSEUDO!", + [(set FR32:$dst, (X86cmov FR32:$t, FR32:$f, imm:$cond, + EFLAGS))]>; + def CMOV_FR64 : I<0, Pseudo, + (outs FR64:$dst), (ins FR64:$t, FR64:$f, i8imm:$cond), + "#CMOV_FR64 PSEUDO!", + [(set FR64:$dst, (X86cmov FR64:$t, FR64:$f, imm:$cond, + EFLAGS))]>; + def CMOV_V4F32 : I<0, Pseudo, + (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond), + "#CMOV_V4F32 PSEUDO!", + [(set VR128:$dst, + (v4f32 (X86cmov VR128:$t, VR128:$f, imm:$cond, + EFLAGS)))]>; + def CMOV_V2F64 : I<0, Pseudo, + (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond), + "#CMOV_V2F64 PSEUDO!", + [(set VR128:$dst, + (v2f64 (X86cmov VR128:$t, VR128:$f, imm:$cond, + EFLAGS)))]>; + def CMOV_V2I64 : I<0, Pseudo, + (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond), + "#CMOV_V2I64 PSEUDO!", + [(set VR128:$dst, + (v2i64 (X86cmov VR128:$t, VR128:$f, imm:$cond, + EFLAGS)))]>; + def CMOV_V8F32 : I<0, Pseudo, + (outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond), + "#CMOV_V8F32 PSEUDO!", + [(set VR256:$dst, + (v8f32 (X86cmov VR256:$t, VR256:$f, imm:$cond, + EFLAGS)))]>; + def CMOV_V4F64 : I<0, Pseudo, + (outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond), + "#CMOV_V4F64 PSEUDO!", + [(set VR256:$dst, + (v4f64 (X86cmov VR256:$t, VR256:$f, imm:$cond, + EFLAGS)))]>; + def CMOV_V4I64 : I<0, Pseudo, + (outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond), + "#CMOV_V4I64 PSEUDO!", + [(set VR256:$dst, + (v4i64 (X86cmov VR256:$t, VR256:$f, imm:$cond, + EFLAGS)))]>; + def CMOV_V8I64 : I<0, Pseudo, + (outs VR512:$dst), (ins VR512:$t, VR512:$f, i8imm:$cond), + "#CMOV_V8I64 PSEUDO!", + [(set VR512:$dst, + (v8i64 (X86cmov VR512:$t, VR512:$f, imm:$cond, + EFLAGS)))]>; + def CMOV_V8F64 : I<0, Pseudo, + (outs VR512:$dst), (ins VR512:$t, VR512:$f, i8imm:$cond), + "#CMOV_V8F64 PSEUDO!", + [(set VR512:$dst, + (v8f64 (X86cmov VR512:$t, VR512:$f, imm:$cond, + EFLAGS)))]>; + def CMOV_V16F32 : I<0, Pseudo, + (outs VR512:$dst), (ins VR512:$t, VR512:$f, i8imm:$cond), + "#CMOV_V16F32 PSEUDO!", + [(set VR512:$dst, + (v16f32 (X86cmov VR512:$t, VR512:$f, imm:$cond, + EFLAGS)))]>; +} + + +//===----------------------------------------------------------------------===// +// DAG Pattern Matching Rules +//===----------------------------------------------------------------------===// + +// ConstantPool GlobalAddress, ExternalSymbol, and JumpTable +def : Pat<(i32 (X86Wrapper tconstpool :$dst)), (MOV32ri tconstpool :$dst)>; +def : Pat<(i32 (X86Wrapper tjumptable :$dst)), (MOV32ri tjumptable :$dst)>; +def : Pat<(i32 (X86Wrapper tglobaltlsaddr:$dst)),(MOV32ri tglobaltlsaddr:$dst)>; +def : Pat<(i32 (X86Wrapper tglobaladdr :$dst)), (MOV32ri tglobaladdr :$dst)>; +def : Pat<(i32 (X86Wrapper texternalsym:$dst)), (MOV32ri texternalsym:$dst)>; +def : Pat<(i32 (X86Wrapper tblockaddress:$dst)), (MOV32ri tblockaddress:$dst)>; + +def : Pat<(add GR32:$src1, (X86Wrapper tconstpool:$src2)), + (ADD32ri GR32:$src1, tconstpool:$src2)>; +def : Pat<(add GR32:$src1, (X86Wrapper tjumptable:$src2)), + (ADD32ri GR32:$src1, tjumptable:$src2)>; +def : Pat<(add GR32:$src1, (X86Wrapper tglobaladdr :$src2)), + (ADD32ri GR32:$src1, tglobaladdr:$src2)>; +def : Pat<(add GR32:$src1, (X86Wrapper texternalsym:$src2)), + (ADD32ri GR32:$src1, texternalsym:$src2)>; +def : Pat<(add GR32:$src1, (X86Wrapper tblockaddress:$src2)), + (ADD32ri GR32:$src1, tblockaddress:$src2)>; + +def : Pat<(store (i32 (X86Wrapper 
tglobaladdr:$src)), addr:$dst), + (MOV32mi addr:$dst, tglobaladdr:$src)>; +def : Pat<(store (i32 (X86Wrapper texternalsym:$src)), addr:$dst), + (MOV32mi addr:$dst, texternalsym:$src)>; +def : Pat<(store (i32 (X86Wrapper tblockaddress:$src)), addr:$dst), + (MOV32mi addr:$dst, tblockaddress:$src)>; + +// ConstantPool GlobalAddress, ExternalSymbol, and JumpTable when not in small +// code model mode, should use 'movabs'. FIXME: This is really a hack, the +// 'movabs' predicate should handle this sort of thing. +def : Pat<(i64 (X86Wrapper tconstpool :$dst)), + (MOV64ri tconstpool :$dst)>, Requires<[FarData]>; +def : Pat<(i64 (X86Wrapper tjumptable :$dst)), + (MOV64ri tjumptable :$dst)>, Requires<[FarData]>; +def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)), + (MOV64ri tglobaladdr :$dst)>, Requires<[FarData]>; +def : Pat<(i64 (X86Wrapper texternalsym:$dst)), + (MOV64ri texternalsym:$dst)>, Requires<[FarData]>; +def : Pat<(i64 (X86Wrapper tblockaddress:$dst)), + (MOV64ri tblockaddress:$dst)>, Requires<[FarData]>; + +// In kernel code model, we can get the address of a label +// into a register with 'movq'. FIXME: This is a hack, the 'imm' predicate of +// the MOV64ri32 should accept these. +def : Pat<(i64 (X86Wrapper tconstpool :$dst)), + (MOV64ri32 tconstpool :$dst)>, Requires<[KernelCode]>; +def : Pat<(i64 (X86Wrapper tjumptable :$dst)), + (MOV64ri32 tjumptable :$dst)>, Requires<[KernelCode]>; +def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)), + (MOV64ri32 tglobaladdr :$dst)>, Requires<[KernelCode]>; +def : Pat<(i64 (X86Wrapper texternalsym:$dst)), + (MOV64ri32 texternalsym:$dst)>, Requires<[KernelCode]>; +def : Pat<(i64 (X86Wrapper tblockaddress:$dst)), + (MOV64ri32 tblockaddress:$dst)>, Requires<[KernelCode]>; + +// If we have small model and -static mode, it is safe to store global addresses +// directly as immediates. FIXME: This is really a hack, the 'imm' predicate +// for MOV64mi32 should handle this sort of thing. +def : Pat<(store (i64 (X86Wrapper tconstpool:$src)), addr:$dst), + (MOV64mi32 addr:$dst, tconstpool:$src)>, + Requires<[NearData, IsStatic]>; +def : Pat<(store (i64 (X86Wrapper tjumptable:$src)), addr:$dst), + (MOV64mi32 addr:$dst, tjumptable:$src)>, + Requires<[NearData, IsStatic]>; +def : Pat<(store (i64 (X86Wrapper tglobaladdr:$src)), addr:$dst), + (MOV64mi32 addr:$dst, tglobaladdr:$src)>, + Requires<[NearData, IsStatic]>; +def : Pat<(store (i64 (X86Wrapper texternalsym:$src)), addr:$dst), + (MOV64mi32 addr:$dst, texternalsym:$src)>, + Requires<[NearData, IsStatic]>; +def : Pat<(store (i64 (X86Wrapper tblockaddress:$src)), addr:$dst), + (MOV64mi32 addr:$dst, tblockaddress:$src)>, + Requires<[NearData, IsStatic]>; + +def : Pat<(i32 (X86RecoverFrameAlloc texternalsym:$dst)), (MOV32ri texternalsym:$dst)>; +def : Pat<(i64 (X86RecoverFrameAlloc texternalsym:$dst)), (MOV64ri texternalsym:$dst)>; + +// Calls + +// tls has some funny stuff here... +// This corresponds to movabs $foo@tpoff, %rax +def : Pat<(i64 (X86Wrapper tglobaltlsaddr :$dst)), + (MOV64ri32 tglobaltlsaddr :$dst)>; +// This corresponds to add $foo@tpoff, %rax +def : Pat<(add GR64:$src1, (X86Wrapper tglobaltlsaddr :$dst)), + (ADD64ri32 GR64:$src1, tglobaltlsaddr :$dst)>; + + +// Direct PC relative function call for small code model. 32-bit displacement +// sign extended to 64-bit. +def : Pat<(X86call (i64 tglobaladdr:$dst)), + (CALL64pcrel32 tglobaladdr:$dst)>; +def : Pat<(X86call (i64 texternalsym:$dst)), + (CALL64pcrel32 texternalsym:$dst)>; + +// Tailcall stuff. 
The TCRETURN instructions execute after the epilog, so they +// can never use callee-saved registers. That is the purpose of the GR64_TC +// register classes. +// +// The only volatile register that is never used by the calling convention is +// %r11. This happens when calling a vararg function with 6 arguments. +// +// Match an X86tcret that uses less than 7 volatile registers. +def X86tcret_6regs : PatFrag<(ops node:$ptr, node:$off), + (X86tcret node:$ptr, node:$off), [{ + // X86tcret args: (*chain, ptr, imm, regs..., glue) + unsigned NumRegs = 0; + for (unsigned i = 3, e = N->getNumOperands(); i != e; ++i) + if (isa<RegisterSDNode>(N->getOperand(i)) && ++NumRegs > 6) + return false; + return true; +}]>; + +def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off), + (TCRETURNri ptr_rc_tailcall:$dst, imm:$off)>, + Requires<[Not64BitMode]>; + +// FIXME: This is disabled for 32-bit PIC mode because the global base +// register which is part of the address mode may be assigned a +// callee-saved register. +def : Pat<(X86tcret (load addr:$dst), imm:$off), + (TCRETURNmi addr:$dst, imm:$off)>, + Requires<[Not64BitMode, IsNotPIC]>; + +def : Pat<(X86tcret (i32 tglobaladdr:$dst), imm:$off), + (TCRETURNdi tglobaladdr:$dst, imm:$off)>, + Requires<[NotLP64]>; + +def : Pat<(X86tcret (i32 texternalsym:$dst), imm:$off), + (TCRETURNdi texternalsym:$dst, imm:$off)>, + Requires<[NotLP64]>; + +def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off), + (TCRETURNri64 ptr_rc_tailcall:$dst, imm:$off)>, + Requires<[In64BitMode]>; + +// Don't fold loads into X86tcret requiring more than 6 regs. +// There wouldn't be enough scratch registers for base+index. +def : Pat<(X86tcret_6regs (load addr:$dst), imm:$off), + (TCRETURNmi64 addr:$dst, imm:$off)>, + Requires<[In64BitMode]>; + +def : Pat<(X86tcret (i64 tglobaladdr:$dst), imm:$off), + (TCRETURNdi64 tglobaladdr:$dst, imm:$off)>, + Requires<[IsLP64]>; + +def : Pat<(X86tcret (i64 texternalsym:$dst), imm:$off), + (TCRETURNdi64 texternalsym:$dst, imm:$off)>, + Requires<[IsLP64]>; + +// Normal calls, with various flavors of addresses. +def : Pat<(X86call (i32 tglobaladdr:$dst)), + (CALLpcrel32 tglobaladdr:$dst)>; +def : Pat<(X86call (i32 texternalsym:$dst)), + (CALLpcrel32 texternalsym:$dst)>; +def : Pat<(X86call (i32 imm:$dst)), + (CALLpcrel32 imm:$dst)>, Requires<[CallImmAddr]>; + +// Comparisons. + +// TEST R,R is smaller than CMP R,0 +def : Pat<(X86cmp GR8:$src1, 0), + (TEST8rr GR8:$src1, GR8:$src1)>; +def : Pat<(X86cmp GR16:$src1, 0), + (TEST16rr GR16:$src1, GR16:$src1)>; +def : Pat<(X86cmp GR32:$src1, 0), + (TEST32rr GR32:$src1, GR32:$src1)>; +def : Pat<(X86cmp GR64:$src1, 0), + (TEST64rr GR64:$src1, GR64:$src1)>; + +// Conditional moves with folded loads with operands swapped and conditions +// inverted. 
+multiclass CMOVmr<PatLeaf InvertedCond, Instruction Inst16, Instruction Inst32, + Instruction Inst64> { + let Predicates = [HasCMov] in { + def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, InvertedCond, EFLAGS), + (Inst16 GR16:$src2, addr:$src1)>; + def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, InvertedCond, EFLAGS), + (Inst32 GR32:$src2, addr:$src1)>; + def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, InvertedCond, EFLAGS), + (Inst64 GR64:$src2, addr:$src1)>; + } +} + +defm : CMOVmr<X86_COND_B , CMOVAE16rm, CMOVAE32rm, CMOVAE64rm>; +defm : CMOVmr<X86_COND_AE, CMOVB16rm , CMOVB32rm , CMOVB64rm>; +defm : CMOVmr<X86_COND_E , CMOVNE16rm, CMOVNE32rm, CMOVNE64rm>; +defm : CMOVmr<X86_COND_NE, CMOVE16rm , CMOVE32rm , CMOVE64rm>; +defm : CMOVmr<X86_COND_BE, CMOVA16rm , CMOVA32rm , CMOVA64rm>; +defm : CMOVmr<X86_COND_A , CMOVBE16rm, CMOVBE32rm, CMOVBE64rm>; +defm : CMOVmr<X86_COND_L , CMOVGE16rm, CMOVGE32rm, CMOVGE64rm>; +defm : CMOVmr<X86_COND_GE, CMOVL16rm , CMOVL32rm , CMOVL64rm>; +defm : CMOVmr<X86_COND_LE, CMOVG16rm , CMOVG32rm , CMOVG64rm>; +defm : CMOVmr<X86_COND_G , CMOVLE16rm, CMOVLE32rm, CMOVLE64rm>; +defm : CMOVmr<X86_COND_P , CMOVNP16rm, CMOVNP32rm, CMOVNP64rm>; +defm : CMOVmr<X86_COND_NP, CMOVP16rm , CMOVP32rm , CMOVP64rm>; +defm : CMOVmr<X86_COND_S , CMOVNS16rm, CMOVNS32rm, CMOVNS64rm>; +defm : CMOVmr<X86_COND_NS, CMOVS16rm , CMOVS32rm , CMOVS64rm>; +defm : CMOVmr<X86_COND_O , CMOVNO16rm, CMOVNO32rm, CMOVNO64rm>; +defm : CMOVmr<X86_COND_NO, CMOVO16rm , CMOVO32rm , CMOVO64rm>; + +// zextload bool -> zextload byte +def : Pat<(zextloadi8i1 addr:$src), (MOV8rm addr:$src)>; +def : Pat<(zextloadi16i1 addr:$src), (MOVZX16rm8 addr:$src)>; +def : Pat<(zextloadi32i1 addr:$src), (MOVZX32rm8 addr:$src)>; +def : Pat<(zextloadi64i1 addr:$src), + (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>; + +// extload bool -> extload byte +// When extloading from 16-bit and smaller memory locations into 64-bit +// registers, use zero-extending loads so that the entire 64-bit register is +// defined, avoiding partial-register updates. + +def : Pat<(extloadi8i1 addr:$src), (MOV8rm addr:$src)>; +def : Pat<(extloadi16i1 addr:$src), (MOVZX16rm8 addr:$src)>; +def : Pat<(extloadi32i1 addr:$src), (MOVZX32rm8 addr:$src)>; +def : Pat<(extloadi16i8 addr:$src), (MOVZX16rm8 addr:$src)>; +def : Pat<(extloadi32i8 addr:$src), (MOVZX32rm8 addr:$src)>; +def : Pat<(extloadi32i16 addr:$src), (MOVZX32rm16 addr:$src)>; + +// For other extloads, use subregs, since the high contents of the register are +// defined after an extload. +def : Pat<(extloadi64i1 addr:$src), + (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>; +def : Pat<(extloadi64i8 addr:$src), + (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>; +def : Pat<(extloadi64i16 addr:$src), + (SUBREG_TO_REG (i64 0), (MOVZX32rm16 addr:$src), sub_32bit)>; +def : Pat<(extloadi64i32 addr:$src), + (SUBREG_TO_REG (i64 0), (MOV32rm addr:$src), sub_32bit)>; + +// anyext. Define these to do an explicit zero-extend to +// avoid partial-register updates. +def : Pat<(i16 (anyext GR8 :$src)), (EXTRACT_SUBREG + (MOVZX32rr8 GR8 :$src), sub_16bit)>; +def : Pat<(i32 (anyext GR8 :$src)), (MOVZX32rr8 GR8 :$src)>; + +// Except for i16 -> i32 since isel expect i16 ops to be promoted to i32. 
+def : Pat<(i32 (anyext GR16:$src)), + (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR16:$src, sub_16bit)>; + +def : Pat<(i64 (anyext GR8 :$src)), + (SUBREG_TO_REG (i64 0), (MOVZX32rr8 GR8 :$src), sub_32bit)>; +def : Pat<(i64 (anyext GR16:$src)), + (SUBREG_TO_REG (i64 0), (MOVZX32rr16 GR16 :$src), sub_32bit)>; +def : Pat<(i64 (anyext GR32:$src)), + (SUBREG_TO_REG (i64 0), GR32:$src, sub_32bit)>; + + +// Any instruction that defines a 32-bit result leaves the high half of the +// register. Truncate can be lowered to EXTRACT_SUBREG. CopyFromReg may +// be copying from a truncate. And x86's cmov doesn't do anything if the +// condition is false. But any other 32-bit operation will zero-extend +// up to 64 bits. +def def32 : PatLeaf<(i32 GR32:$src), [{ + return N->getOpcode() != ISD::TRUNCATE && + N->getOpcode() != TargetOpcode::EXTRACT_SUBREG && + N->getOpcode() != ISD::CopyFromReg && + N->getOpcode() != ISD::AssertSext && + N->getOpcode() != X86ISD::CMOV; +}]>; + +// In the case of a 32-bit def that is known to implicitly zero-extend, +// we can use a SUBREG_TO_REG. +def : Pat<(i64 (zext def32:$src)), + (SUBREG_TO_REG (i64 0), GR32:$src, sub_32bit)>; + +//===----------------------------------------------------------------------===// +// Pattern match OR as ADD +//===----------------------------------------------------------------------===// + +// If safe, we prefer to pattern match OR as ADD at isel time. ADD can be +// 3-addressified into an LEA instruction to avoid copies. However, we also +// want to finally emit these instructions as an or at the end of the code +// generator to make the generated code easier to read. To do this, we select +// into "disjoint bits" pseudo ops. + +// Treat an 'or' node is as an 'add' if the or'ed bits are known to be zero. +def or_is_add : PatFrag<(ops node:$lhs, node:$rhs), (or node:$lhs, node:$rhs),[{ + if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(1))) + return CurDAG->MaskedValueIsZero(N->getOperand(0), CN->getAPIntValue()); + + APInt KnownZero0, KnownOne0; + CurDAG->computeKnownBits(N->getOperand(0), KnownZero0, KnownOne0, 0); + APInt KnownZero1, KnownOne1; + CurDAG->computeKnownBits(N->getOperand(1), KnownZero1, KnownOne1, 0); + return (~KnownZero0 & ~KnownZero1) == 0; +}]>; + + +// (or x1, x2) -> (add x1, x2) if two operands are known not to share bits. +// Try this before the selecting to OR. +let AddedComplexity = 5, SchedRW = [WriteALU] in { + +let isConvertibleToThreeAddress = 1, + Constraints = "$src1 = $dst", Defs = [EFLAGS] in { +let isCommutable = 1 in { +def ADD16rr_DB : I<0, Pseudo, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), + "", // orw/addw REG, REG + [(set GR16:$dst, (or_is_add GR16:$src1, GR16:$src2))]>; +def ADD32rr_DB : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), + "", // orl/addl REG, REG + [(set GR32:$dst, (or_is_add GR32:$src1, GR32:$src2))]>; +def ADD64rr_DB : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), + "", // orq/addq REG, REG + [(set GR64:$dst, (or_is_add GR64:$src1, GR64:$src2))]>; +} // isCommutable + +// NOTE: These are order specific, we want the ri8 forms to be listed +// first so that they are slightly preferred to the ri forms. 
+ +def ADD16ri8_DB : I<0, Pseudo, + (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2), + "", // orw/addw REG, imm8 + [(set GR16:$dst,(or_is_add GR16:$src1,i16immSExt8:$src2))]>; +def ADD16ri_DB : I<0, Pseudo, (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2), + "", // orw/addw REG, imm + [(set GR16:$dst, (or_is_add GR16:$src1, imm:$src2))]>; + +def ADD32ri8_DB : I<0, Pseudo, + (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2), + "", // orl/addl REG, imm8 + [(set GR32:$dst,(or_is_add GR32:$src1,i32immSExt8:$src2))]>; +def ADD32ri_DB : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2), + "", // orl/addl REG, imm + [(set GR32:$dst, (or_is_add GR32:$src1, imm:$src2))]>; + + +def ADD64ri8_DB : I<0, Pseudo, + (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2), + "", // orq/addq REG, imm8 + [(set GR64:$dst, (or_is_add GR64:$src1, + i64immSExt8:$src2))]>; +def ADD64ri32_DB : I<0, Pseudo, + (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2), + "", // orq/addq REG, imm + [(set GR64:$dst, (or_is_add GR64:$src1, + i64immSExt32:$src2))]>; +} +} // AddedComplexity, SchedRW + + +//===----------------------------------------------------------------------===// +// Some peepholes +//===----------------------------------------------------------------------===// + +// Odd encoding trick: -128 fits into an 8-bit immediate field while +// +128 doesn't, so in this special case use a sub instead of an add. +def : Pat<(add GR16:$src1, 128), + (SUB16ri8 GR16:$src1, -128)>; +def : Pat<(store (add (loadi16 addr:$dst), 128), addr:$dst), + (SUB16mi8 addr:$dst, -128)>; + +def : Pat<(add GR32:$src1, 128), + (SUB32ri8 GR32:$src1, -128)>; +def : Pat<(store (add (loadi32 addr:$dst), 128), addr:$dst), + (SUB32mi8 addr:$dst, -128)>; + +def : Pat<(add GR64:$src1, 128), + (SUB64ri8 GR64:$src1, -128)>; +def : Pat<(store (add (loadi64 addr:$dst), 128), addr:$dst), + (SUB64mi8 addr:$dst, -128)>; + +// The same trick applies for 32-bit immediate fields in 64-bit +// instructions. +def : Pat<(add GR64:$src1, 0x0000000080000000), + (SUB64ri32 GR64:$src1, 0xffffffff80000000)>; +def : Pat<(store (add (loadi64 addr:$dst), 0x00000000800000000), addr:$dst), + (SUB64mi32 addr:$dst, 0xffffffff80000000)>; + +// To avoid needing to materialize an immediate in a register, use a 32-bit and +// with implicit zero-extension instead of a 64-bit and if the immediate has at +// least 32 bits of leading zeros. If in addition the last 32 bits can be +// represented with a sign extension of a 8 bit constant, use that. 
+ +def : Pat<(and GR64:$src, i64immZExt32SExt8:$imm), + (SUBREG_TO_REG + (i64 0), + (AND32ri8 + (EXTRACT_SUBREG GR64:$src, sub_32bit), + (i32 (GetLo8XForm imm:$imm))), + sub_32bit)>; + +def : Pat<(and GR64:$src, i64immZExt32:$imm), + (SUBREG_TO_REG + (i64 0), + (AND32ri + (EXTRACT_SUBREG GR64:$src, sub_32bit), + (i32 (GetLo32XForm imm:$imm))), + sub_32bit)>; + + +// r & (2^16-1) ==> movz +def : Pat<(and GR32:$src1, 0xffff), + (MOVZX32rr16 (EXTRACT_SUBREG GR32:$src1, sub_16bit))>; +// r & (2^8-1) ==> movz +def : Pat<(and GR32:$src1, 0xff), + (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src1, + GR32_ABCD)), + sub_8bit))>, + Requires<[Not64BitMode]>; +// r & (2^8-1) ==> movz +def : Pat<(and GR16:$src1, 0xff), + (EXTRACT_SUBREG (MOVZX32rr8 (EXTRACT_SUBREG + (i16 (COPY_TO_REGCLASS GR16:$src1, GR16_ABCD)), sub_8bit)), + sub_16bit)>, + Requires<[Not64BitMode]>; + +// r & (2^32-1) ==> movz +def : Pat<(and GR64:$src, 0x00000000FFFFFFFF), + (SUBREG_TO_REG (i64 0), + (MOV32rr (EXTRACT_SUBREG GR64:$src, sub_32bit)), + sub_32bit)>; +// r & (2^16-1) ==> movz +def : Pat<(and GR64:$src, 0xffff), + (SUBREG_TO_REG (i64 0), + (MOVZX32rr16 (i16 (EXTRACT_SUBREG GR64:$src, sub_16bit))), + sub_32bit)>; +// r & (2^8-1) ==> movz +def : Pat<(and GR64:$src, 0xff), + (SUBREG_TO_REG (i64 0), + (MOVZX32rr8 (i8 (EXTRACT_SUBREG GR64:$src, sub_8bit))), + sub_32bit)>; +// r & (2^8-1) ==> movz +def : Pat<(and GR32:$src1, 0xff), + (MOVZX32rr8 (EXTRACT_SUBREG GR32:$src1, sub_8bit))>, + Requires<[In64BitMode]>; +// r & (2^8-1) ==> movz +def : Pat<(and GR16:$src1, 0xff), + (EXTRACT_SUBREG (MOVZX32rr8 (i8 + (EXTRACT_SUBREG GR16:$src1, sub_8bit))), sub_16bit)>, + Requires<[In64BitMode]>; + + +// sext_inreg patterns +def : Pat<(sext_inreg GR32:$src, i16), + (MOVSX32rr16 (EXTRACT_SUBREG GR32:$src, sub_16bit))>; +def : Pat<(sext_inreg GR32:$src, i8), + (MOVSX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, + GR32_ABCD)), + sub_8bit))>, + Requires<[Not64BitMode]>; + +def : Pat<(sext_inreg GR16:$src, i8), + (EXTRACT_SUBREG (i32 (MOVSX32rr8 (EXTRACT_SUBREG + (i32 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), sub_8bit))), + sub_16bit)>, + Requires<[Not64BitMode]>; + +def : Pat<(sext_inreg GR64:$src, i32), + (MOVSX64rr32 (EXTRACT_SUBREG GR64:$src, sub_32bit))>; +def : Pat<(sext_inreg GR64:$src, i16), + (MOVSX64rr16 (EXTRACT_SUBREG GR64:$src, sub_16bit))>; +def : Pat<(sext_inreg GR64:$src, i8), + (MOVSX64rr8 (EXTRACT_SUBREG GR64:$src, sub_8bit))>; +def : Pat<(sext_inreg GR32:$src, i8), + (MOVSX32rr8 (EXTRACT_SUBREG GR32:$src, sub_8bit))>, + Requires<[In64BitMode]>; +def : Pat<(sext_inreg GR16:$src, i8), + (EXTRACT_SUBREG (MOVSX32rr8 + (EXTRACT_SUBREG GR16:$src, sub_8bit)), sub_16bit)>, + Requires<[In64BitMode]>; + +// sext, sext_load, zext, zext_load +def: Pat<(i16 (sext GR8:$src)), + (EXTRACT_SUBREG (MOVSX32rr8 GR8:$src), sub_16bit)>; +def: Pat<(sextloadi16i8 addr:$src), + (EXTRACT_SUBREG (MOVSX32rm8 addr:$src), sub_16bit)>; +def: Pat<(i16 (zext GR8:$src)), + (EXTRACT_SUBREG (MOVZX32rr8 GR8:$src), sub_16bit)>; +def: Pat<(zextloadi16i8 addr:$src), + (EXTRACT_SUBREG (MOVZX32rm8 addr:$src), sub_16bit)>; + +// trunc patterns +def : Pat<(i16 (trunc GR32:$src)), + (EXTRACT_SUBREG GR32:$src, sub_16bit)>; +def : Pat<(i8 (trunc GR32:$src)), + (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)), + sub_8bit)>, + Requires<[Not64BitMode]>; +def : Pat<(i8 (trunc GR16:$src)), + (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), + sub_8bit)>, + Requires<[Not64BitMode]>; +def : Pat<(i32 (trunc GR64:$src)), + 
(EXTRACT_SUBREG GR64:$src, sub_32bit)>; +def : Pat<(i16 (trunc GR64:$src)), + (EXTRACT_SUBREG GR64:$src, sub_16bit)>; +def : Pat<(i8 (trunc GR64:$src)), + (EXTRACT_SUBREG GR64:$src, sub_8bit)>; +def : Pat<(i8 (trunc GR32:$src)), + (EXTRACT_SUBREG GR32:$src, sub_8bit)>, + Requires<[In64BitMode]>; +def : Pat<(i8 (trunc GR16:$src)), + (EXTRACT_SUBREG GR16:$src, sub_8bit)>, + Requires<[In64BitMode]>; + +// h-register tricks +def : Pat<(i8 (trunc (srl_su GR16:$src, (i8 8)))), + (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), + sub_8bit_hi)>, + Requires<[Not64BitMode]>; +def : Pat<(i8 (trunc (srl_su GR32:$src, (i8 8)))), + (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)), + sub_8bit_hi)>, + Requires<[Not64BitMode]>; +def : Pat<(srl GR16:$src, (i8 8)), + (EXTRACT_SUBREG + (MOVZX32rr8 + (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), + sub_8bit_hi)), + sub_16bit)>, + Requires<[Not64BitMode]>; +def : Pat<(i32 (zext (srl_su GR16:$src, (i8 8)))), + (MOVZX32rr8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, + GR16_ABCD)), + sub_8bit_hi))>, + Requires<[Not64BitMode]>; +def : Pat<(i32 (anyext (srl_su GR16:$src, (i8 8)))), + (MOVZX32rr8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, + GR16_ABCD)), + sub_8bit_hi))>, + Requires<[Not64BitMode]>; +def : Pat<(and (srl_su GR32:$src, (i8 8)), (i32 255)), + (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, + GR32_ABCD)), + sub_8bit_hi))>, + Requires<[Not64BitMode]>; +def : Pat<(srl (and_su GR32:$src, 0xff00), (i8 8)), + (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, + GR32_ABCD)), + sub_8bit_hi))>, + Requires<[Not64BitMode]>; + +// h-register tricks. +// For now, be conservative on x86-64 and use an h-register extract only if the +// value is immediately zero-extended or stored, which are somewhat common +// cases. This uses a bunch of code to prevent a register requiring a REX prefix +// from being allocated in the same instruction as the h register, as there's +// currently no way to describe this requirement to the register allocator. + +// h-register extract and zero-extend. 
+def : Pat<(and (srl_su GR64:$src, (i8 8)), (i64 255)), + (SUBREG_TO_REG + (i64 0), + (MOVZX32_NOREXrr8 + (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS GR64:$src, GR64_ABCD)), + sub_8bit_hi)), + sub_32bit)>; +def : Pat<(and (srl_su GR32:$src, (i8 8)), (i32 255)), + (MOVZX32_NOREXrr8 + (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)), + sub_8bit_hi))>, + Requires<[In64BitMode]>; +def : Pat<(srl (and_su GR32:$src, 0xff00), (i8 8)), + (MOVZX32_NOREXrr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, + GR32_ABCD)), + sub_8bit_hi))>, + Requires<[In64BitMode]>; +def : Pat<(srl GR16:$src, (i8 8)), + (EXTRACT_SUBREG + (MOVZX32_NOREXrr8 + (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), + sub_8bit_hi)), + sub_16bit)>, + Requires<[In64BitMode]>; +def : Pat<(i32 (zext (srl_su GR16:$src, (i8 8)))), + (MOVZX32_NOREXrr8 + (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), + sub_8bit_hi))>, + Requires<[In64BitMode]>; +def : Pat<(i32 (anyext (srl_su GR16:$src, (i8 8)))), + (MOVZX32_NOREXrr8 + (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), + sub_8bit_hi))>, + Requires<[In64BitMode]>; +def : Pat<(i64 (zext (srl_su GR16:$src, (i8 8)))), + (SUBREG_TO_REG + (i64 0), + (MOVZX32_NOREXrr8 + (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), + sub_8bit_hi)), + sub_32bit)>; +def : Pat<(i64 (anyext (srl_su GR16:$src, (i8 8)))), + (SUBREG_TO_REG + (i64 0), + (MOVZX32_NOREXrr8 + (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), + sub_8bit_hi)), + sub_32bit)>; + +// h-register extract and store. +def : Pat<(store (i8 (trunc_su (srl_su GR64:$src, (i8 8)))), addr:$dst), + (MOV8mr_NOREX + addr:$dst, + (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS GR64:$src, GR64_ABCD)), + sub_8bit_hi))>; +def : Pat<(store (i8 (trunc_su (srl_su GR32:$src, (i8 8)))), addr:$dst), + (MOV8mr_NOREX + addr:$dst, + (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)), + sub_8bit_hi))>, + Requires<[In64BitMode]>; +def : Pat<(store (i8 (trunc_su (srl_su GR16:$src, (i8 8)))), addr:$dst), + (MOV8mr_NOREX + addr:$dst, + (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), + sub_8bit_hi))>, + Requires<[In64BitMode]>; + + +// (shl x, 1) ==> (add x, x) +// Note that if x is undef (immediate or otherwise), we could theoretically +// end up with the two uses of x getting different values, producing a result +// where the least significant bit is not 0. However, the probability of this +// happening is considered low enough that this is officially not a +// "real problem". +def : Pat<(shl GR8 :$src1, (i8 1)), (ADD8rr GR8 :$src1, GR8 :$src1)>; +def : Pat<(shl GR16:$src1, (i8 1)), (ADD16rr GR16:$src1, GR16:$src1)>; +def : Pat<(shl GR32:$src1, (i8 1)), (ADD32rr GR32:$src1, GR32:$src1)>; +def : Pat<(shl GR64:$src1, (i8 1)), (ADD64rr GR64:$src1, GR64:$src1)>; + +// Helper imms that check if a mask doesn't change significant shift bits. +def immShift32 : ImmLeaf<i8, [{ return CountTrailingOnes_32(Imm) >= 5; }]>; +def immShift64 : ImmLeaf<i8, [{ return CountTrailingOnes_32(Imm) >= 6; }]>; + +// Shift amount is implicitly masked. 
+multiclass MaskedShiftAmountPats<SDNode frag, string name> { + // (shift x (and y, 31)) ==> (shift x, y) + def : Pat<(frag GR8:$src1, (and CL, immShift32)), + (!cast<Instruction>(name # "8rCL") GR8:$src1)>; + def : Pat<(frag GR16:$src1, (and CL, immShift32)), + (!cast<Instruction>(name # "16rCL") GR16:$src1)>; + def : Pat<(frag GR32:$src1, (and CL, immShift32)), + (!cast<Instruction>(name # "32rCL") GR32:$src1)>; + def : Pat<(store (frag (loadi8 addr:$dst), (and CL, immShift32)), addr:$dst), + (!cast<Instruction>(name # "8mCL") addr:$dst)>; + def : Pat<(store (frag (loadi16 addr:$dst), (and CL, immShift32)), addr:$dst), + (!cast<Instruction>(name # "16mCL") addr:$dst)>; + def : Pat<(store (frag (loadi32 addr:$dst), (and CL, immShift32)), addr:$dst), + (!cast<Instruction>(name # "32mCL") addr:$dst)>; + + // (shift x (and y, 63)) ==> (shift x, y) + def : Pat<(frag GR64:$src1, (and CL, immShift64)), + (!cast<Instruction>(name # "64rCL") GR64:$src1)>; + def : Pat<(store (frag (loadi64 addr:$dst), (and CL, 63)), addr:$dst), + (!cast<Instruction>(name # "64mCL") addr:$dst)>; +} + +defm : MaskedShiftAmountPats<shl, "SHL">; +defm : MaskedShiftAmountPats<srl, "SHR">; +defm : MaskedShiftAmountPats<sra, "SAR">; +defm : MaskedShiftAmountPats<rotl, "ROL">; +defm : MaskedShiftAmountPats<rotr, "ROR">; + +// (anyext (setcc_carry)) -> (setcc_carry) +def : Pat<(i16 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), + (SETB_C16r)>; +def : Pat<(i32 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), + (SETB_C32r)>; +def : Pat<(i32 (anyext (i16 (X86setcc_c X86_COND_B, EFLAGS)))), + (SETB_C32r)>; + + + + +//===----------------------------------------------------------------------===// +// EFLAGS-defining Patterns +//===----------------------------------------------------------------------===// + +// add reg, reg +def : Pat<(add GR8 :$src1, GR8 :$src2), (ADD8rr GR8 :$src1, GR8 :$src2)>; +def : Pat<(add GR16:$src1, GR16:$src2), (ADD16rr GR16:$src1, GR16:$src2)>; +def : Pat<(add GR32:$src1, GR32:$src2), (ADD32rr GR32:$src1, GR32:$src2)>; + +// add reg, mem +def : Pat<(add GR8:$src1, (loadi8 addr:$src2)), + (ADD8rm GR8:$src1, addr:$src2)>; +def : Pat<(add GR16:$src1, (loadi16 addr:$src2)), + (ADD16rm GR16:$src1, addr:$src2)>; +def : Pat<(add GR32:$src1, (loadi32 addr:$src2)), + (ADD32rm GR32:$src1, addr:$src2)>; + +// add reg, imm +def : Pat<(add GR8 :$src1, imm:$src2), (ADD8ri GR8:$src1 , imm:$src2)>; +def : Pat<(add GR16:$src1, imm:$src2), (ADD16ri GR16:$src1, imm:$src2)>; +def : Pat<(add GR32:$src1, imm:$src2), (ADD32ri GR32:$src1, imm:$src2)>; +def : Pat<(add GR16:$src1, i16immSExt8:$src2), + (ADD16ri8 GR16:$src1, i16immSExt8:$src2)>; +def : Pat<(add GR32:$src1, i32immSExt8:$src2), + (ADD32ri8 GR32:$src1, i32immSExt8:$src2)>; + +// sub reg, reg +def : Pat<(sub GR8 :$src1, GR8 :$src2), (SUB8rr GR8 :$src1, GR8 :$src2)>; +def : Pat<(sub GR16:$src1, GR16:$src2), (SUB16rr GR16:$src1, GR16:$src2)>; +def : Pat<(sub GR32:$src1, GR32:$src2), (SUB32rr GR32:$src1, GR32:$src2)>; + +// sub reg, mem +def : Pat<(sub GR8:$src1, (loadi8 addr:$src2)), + (SUB8rm GR8:$src1, addr:$src2)>; +def : Pat<(sub GR16:$src1, (loadi16 addr:$src2)), + (SUB16rm GR16:$src1, addr:$src2)>; +def : Pat<(sub GR32:$src1, (loadi32 addr:$src2)), + (SUB32rm GR32:$src1, addr:$src2)>; + +// sub reg, imm +def : Pat<(sub GR8:$src1, imm:$src2), + (SUB8ri GR8:$src1, imm:$src2)>; +def : Pat<(sub GR16:$src1, imm:$src2), + (SUB16ri GR16:$src1, imm:$src2)>; +def : Pat<(sub GR32:$src1, imm:$src2), + (SUB32ri GR32:$src1, imm:$src2)>; +def : Pat<(sub GR16:$src1, 
i16immSExt8:$src2), + (SUB16ri8 GR16:$src1, i16immSExt8:$src2)>; +def : Pat<(sub GR32:$src1, i32immSExt8:$src2), + (SUB32ri8 GR32:$src1, i32immSExt8:$src2)>; + +// sub 0, reg +def : Pat<(X86sub_flag 0, GR8 :$src), (NEG8r GR8 :$src)>; +def : Pat<(X86sub_flag 0, GR16:$src), (NEG16r GR16:$src)>; +def : Pat<(X86sub_flag 0, GR32:$src), (NEG32r GR32:$src)>; +def : Pat<(X86sub_flag 0, GR64:$src), (NEG64r GR64:$src)>; + +// mul reg, reg +def : Pat<(mul GR16:$src1, GR16:$src2), + (IMUL16rr GR16:$src1, GR16:$src2)>; +def : Pat<(mul GR32:$src1, GR32:$src2), + (IMUL32rr GR32:$src1, GR32:$src2)>; + +// mul reg, mem +def : Pat<(mul GR16:$src1, (loadi16 addr:$src2)), + (IMUL16rm GR16:$src1, addr:$src2)>; +def : Pat<(mul GR32:$src1, (loadi32 addr:$src2)), + (IMUL32rm GR32:$src1, addr:$src2)>; + +// mul reg, imm +def : Pat<(mul GR16:$src1, imm:$src2), + (IMUL16rri GR16:$src1, imm:$src2)>; +def : Pat<(mul GR32:$src1, imm:$src2), + (IMUL32rri GR32:$src1, imm:$src2)>; +def : Pat<(mul GR16:$src1, i16immSExt8:$src2), + (IMUL16rri8 GR16:$src1, i16immSExt8:$src2)>; +def : Pat<(mul GR32:$src1, i32immSExt8:$src2), + (IMUL32rri8 GR32:$src1, i32immSExt8:$src2)>; + +// reg = mul mem, imm +def : Pat<(mul (loadi16 addr:$src1), imm:$src2), + (IMUL16rmi addr:$src1, imm:$src2)>; +def : Pat<(mul (loadi32 addr:$src1), imm:$src2), + (IMUL32rmi addr:$src1, imm:$src2)>; +def : Pat<(mul (loadi16 addr:$src1), i16immSExt8:$src2), + (IMUL16rmi8 addr:$src1, i16immSExt8:$src2)>; +def : Pat<(mul (loadi32 addr:$src1), i32immSExt8:$src2), + (IMUL32rmi8 addr:$src1, i32immSExt8:$src2)>; + +// Patterns for nodes that do not produce flags, for instructions that do. + +// addition +def : Pat<(add GR64:$src1, GR64:$src2), + (ADD64rr GR64:$src1, GR64:$src2)>; +def : Pat<(add GR64:$src1, i64immSExt8:$src2), + (ADD64ri8 GR64:$src1, i64immSExt8:$src2)>; +def : Pat<(add GR64:$src1, i64immSExt32:$src2), + (ADD64ri32 GR64:$src1, i64immSExt32:$src2)>; +def : Pat<(add GR64:$src1, (loadi64 addr:$src2)), + (ADD64rm GR64:$src1, addr:$src2)>; + +// subtraction +def : Pat<(sub GR64:$src1, GR64:$src2), + (SUB64rr GR64:$src1, GR64:$src2)>; +def : Pat<(sub GR64:$src1, (loadi64 addr:$src2)), + (SUB64rm GR64:$src1, addr:$src2)>; +def : Pat<(sub GR64:$src1, i64immSExt8:$src2), + (SUB64ri8 GR64:$src1, i64immSExt8:$src2)>; +def : Pat<(sub GR64:$src1, i64immSExt32:$src2), + (SUB64ri32 GR64:$src1, i64immSExt32:$src2)>; + +// Multiply +def : Pat<(mul GR64:$src1, GR64:$src2), + (IMUL64rr GR64:$src1, GR64:$src2)>; +def : Pat<(mul GR64:$src1, (loadi64 addr:$src2)), + (IMUL64rm GR64:$src1, addr:$src2)>; +def : Pat<(mul GR64:$src1, i64immSExt8:$src2), + (IMUL64rri8 GR64:$src1, i64immSExt8:$src2)>; +def : Pat<(mul GR64:$src1, i64immSExt32:$src2), + (IMUL64rri32 GR64:$src1, i64immSExt32:$src2)>; +def : Pat<(mul (loadi64 addr:$src1), i64immSExt8:$src2), + (IMUL64rmi8 addr:$src1, i64immSExt8:$src2)>; +def : Pat<(mul (loadi64 addr:$src1), i64immSExt32:$src2), + (IMUL64rmi32 addr:$src1, i64immSExt32:$src2)>; + +// Increment/Decrement reg. +// Do not make INC/DEC if it is slow +let Predicates = [NotSlowIncDec] in { + def : Pat<(add GR8:$src, 1), (INC8r GR8:$src)>; + def : Pat<(add GR16:$src, 1), (INC16r GR16:$src)>; + def : Pat<(add GR32:$src, 1), (INC32r GR32:$src)>; + def : Pat<(add GR64:$src, 1), (INC64r GR64:$src)>; + def : Pat<(add GR8:$src, -1), (DEC8r GR8:$src)>; + def : Pat<(add GR16:$src, -1), (DEC16r GR16:$src)>; + def : Pat<(add GR32:$src, -1), (DEC32r GR32:$src)>; + def : Pat<(add GR64:$src, -1), (DEC64r GR64:$src)>; +} + +// or reg/reg. 
+def : Pat<(or GR8 :$src1, GR8 :$src2), (OR8rr GR8 :$src1, GR8 :$src2)>; +def : Pat<(or GR16:$src1, GR16:$src2), (OR16rr GR16:$src1, GR16:$src2)>; +def : Pat<(or GR32:$src1, GR32:$src2), (OR32rr GR32:$src1, GR32:$src2)>; +def : Pat<(or GR64:$src1, GR64:$src2), (OR64rr GR64:$src1, GR64:$src2)>; + +// or reg/mem +def : Pat<(or GR8:$src1, (loadi8 addr:$src2)), + (OR8rm GR8:$src1, addr:$src2)>; +def : Pat<(or GR16:$src1, (loadi16 addr:$src2)), + (OR16rm GR16:$src1, addr:$src2)>; +def : Pat<(or GR32:$src1, (loadi32 addr:$src2)), + (OR32rm GR32:$src1, addr:$src2)>; +def : Pat<(or GR64:$src1, (loadi64 addr:$src2)), + (OR64rm GR64:$src1, addr:$src2)>; + +// or reg/imm +def : Pat<(or GR8:$src1 , imm:$src2), (OR8ri GR8 :$src1, imm:$src2)>; +def : Pat<(or GR16:$src1, imm:$src2), (OR16ri GR16:$src1, imm:$src2)>; +def : Pat<(or GR32:$src1, imm:$src2), (OR32ri GR32:$src1, imm:$src2)>; +def : Pat<(or GR16:$src1, i16immSExt8:$src2), + (OR16ri8 GR16:$src1, i16immSExt8:$src2)>; +def : Pat<(or GR32:$src1, i32immSExt8:$src2), + (OR32ri8 GR32:$src1, i32immSExt8:$src2)>; +def : Pat<(or GR64:$src1, i64immSExt8:$src2), + (OR64ri8 GR64:$src1, i64immSExt8:$src2)>; +def : Pat<(or GR64:$src1, i64immSExt32:$src2), + (OR64ri32 GR64:$src1, i64immSExt32:$src2)>; + +// xor reg/reg +def : Pat<(xor GR8 :$src1, GR8 :$src2), (XOR8rr GR8 :$src1, GR8 :$src2)>; +def : Pat<(xor GR16:$src1, GR16:$src2), (XOR16rr GR16:$src1, GR16:$src2)>; +def : Pat<(xor GR32:$src1, GR32:$src2), (XOR32rr GR32:$src1, GR32:$src2)>; +def : Pat<(xor GR64:$src1, GR64:$src2), (XOR64rr GR64:$src1, GR64:$src2)>; + +// xor reg/mem +def : Pat<(xor GR8:$src1, (loadi8 addr:$src2)), + (XOR8rm GR8:$src1, addr:$src2)>; +def : Pat<(xor GR16:$src1, (loadi16 addr:$src2)), + (XOR16rm GR16:$src1, addr:$src2)>; +def : Pat<(xor GR32:$src1, (loadi32 addr:$src2)), + (XOR32rm GR32:$src1, addr:$src2)>; +def : Pat<(xor GR64:$src1, (loadi64 addr:$src2)), + (XOR64rm GR64:$src1, addr:$src2)>; + +// xor reg/imm +def : Pat<(xor GR8:$src1, imm:$src2), + (XOR8ri GR8:$src1, imm:$src2)>; +def : Pat<(xor GR16:$src1, imm:$src2), + (XOR16ri GR16:$src1, imm:$src2)>; +def : Pat<(xor GR32:$src1, imm:$src2), + (XOR32ri GR32:$src1, imm:$src2)>; +def : Pat<(xor GR16:$src1, i16immSExt8:$src2), + (XOR16ri8 GR16:$src1, i16immSExt8:$src2)>; +def : Pat<(xor GR32:$src1, i32immSExt8:$src2), + (XOR32ri8 GR32:$src1, i32immSExt8:$src2)>; +def : Pat<(xor GR64:$src1, i64immSExt8:$src2), + (XOR64ri8 GR64:$src1, i64immSExt8:$src2)>; +def : Pat<(xor GR64:$src1, i64immSExt32:$src2), + (XOR64ri32 GR64:$src1, i64immSExt32:$src2)>; + +// and reg/reg +def : Pat<(and GR8 :$src1, GR8 :$src2), (AND8rr GR8 :$src1, GR8 :$src2)>; +def : Pat<(and GR16:$src1, GR16:$src2), (AND16rr GR16:$src1, GR16:$src2)>; +def : Pat<(and GR32:$src1, GR32:$src2), (AND32rr GR32:$src1, GR32:$src2)>; +def : Pat<(and GR64:$src1, GR64:$src2), (AND64rr GR64:$src1, GR64:$src2)>; + +// and reg/mem +def : Pat<(and GR8:$src1, (loadi8 addr:$src2)), + (AND8rm GR8:$src1, addr:$src2)>; +def : Pat<(and GR16:$src1, (loadi16 addr:$src2)), + (AND16rm GR16:$src1, addr:$src2)>; +def : Pat<(and GR32:$src1, (loadi32 addr:$src2)), + (AND32rm GR32:$src1, addr:$src2)>; +def : Pat<(and GR64:$src1, (loadi64 addr:$src2)), + (AND64rm GR64:$src1, addr:$src2)>; + +// and reg/imm +def : Pat<(and GR8:$src1, imm:$src2), + (AND8ri GR8:$src1, imm:$src2)>; +def : Pat<(and GR16:$src1, imm:$src2), + (AND16ri GR16:$src1, imm:$src2)>; +def : Pat<(and GR32:$src1, imm:$src2), + (AND32ri GR32:$src1, imm:$src2)>; +def : Pat<(and GR16:$src1, i16immSExt8:$src2), + (AND16ri8 
GR16:$src1, i16immSExt8:$src2)>; +def : Pat<(and GR32:$src1, i32immSExt8:$src2), + (AND32ri8 GR32:$src1, i32immSExt8:$src2)>; +def : Pat<(and GR64:$src1, i64immSExt8:$src2), + (AND64ri8 GR64:$src1, i64immSExt8:$src2)>; +def : Pat<(and GR64:$src1, i64immSExt32:$src2), + (AND64ri32 GR64:$src1, i64immSExt32:$src2)>; + +// Bit scan instruction patterns to match explicit zero-undef behavior. +def : Pat<(cttz_zero_undef GR16:$src), (BSF16rr GR16:$src)>; +def : Pat<(cttz_zero_undef GR32:$src), (BSF32rr GR32:$src)>; +def : Pat<(cttz_zero_undef GR64:$src), (BSF64rr GR64:$src)>; +def : Pat<(cttz_zero_undef (loadi16 addr:$src)), (BSF16rm addr:$src)>; +def : Pat<(cttz_zero_undef (loadi32 addr:$src)), (BSF32rm addr:$src)>; +def : Pat<(cttz_zero_undef (loadi64 addr:$src)), (BSF64rm addr:$src)>; + +// When HasMOVBE is enabled it is possible to get a non-legalized +// register-register 16 bit bswap. This maps it to a ROL instruction. +let Predicates = [HasMOVBE] in { + def : Pat<(bswap GR16:$src), (ROL16ri GR16:$src, (i8 8))>; +} diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index c8d5c591ba9..765417f64a8 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -1804,58 +1804,6 @@ X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, return false; } -int X86InstrInfo::getSPAdjust(const MachineInstr *MI) const { - const MachineFunction *MF = MI->getParent()->getParent(); - const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering(); - - if (MI->getOpcode() == getCallFrameSetupOpcode() || - MI->getOpcode() == getCallFrameDestroyOpcode()) { - unsigned StackAlign = TFI->getStackAlignment(); - int SPAdj = (MI->getOperand(0).getImm() + StackAlign - 1) / StackAlign * - StackAlign; - - SPAdj -= MI->getOperand(1).getImm(); - - if (MI->getOpcode() == getCallFrameSetupOpcode()) - return SPAdj; - else - return -SPAdj; - } - - // To know whether a call adjusts the stack, we need information - // that is bound to the following ADJCALLSTACKUP pseudo. - // Look for the next ADJCALLSTACKUP that follows the call. - if (MI->isCall()) { - const MachineBasicBlock* MBB = MI->getParent(); - auto I = ++MachineBasicBlock::const_iterator(MI); - for (auto E = MBB->end(); I != E; ++I) { - if (I->getOpcode() == getCallFrameDestroyOpcode() || - I->isCall()) - break; - } - - // If we could not find a frame destroy opcode, then it has already - // been simplified, so we don't care. - if (I->getOpcode() != getCallFrameDestroyOpcode()) - return 0; - - return -(I->getOperand(1).getImm()); - } - - // Currently handle only PUSHes we can reasonably expect to see - // in call sequences - switch (MI->getOpcode()) { - default: - return 0; - case X86::PUSH32i8: - case X86::PUSH32r: - case X86::PUSH32rmm: - case X86::PUSH32rmr: - case X86::PUSHi32: - return 4; - } -} - /// isFrameOperand - Return true and the FrameIndex if the specified /// operand and follow operands form a reference to the stack frame. bool X86InstrInfo::isFrameOperand(const MachineInstr *MI, unsigned int Op, diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h index 4d15467f0ca..5662e86932c 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.h +++ b/llvm/lib/Target/X86/X86InstrInfo.h @@ -175,11 +175,6 @@ public: /// const X86RegisterInfo &getRegisterInfo() const { return RI; } - /// getSPAdjust - This returns the stack pointer adjustment made by - /// this instruction. For x86, we need to handle more complex call - /// sequences involving PUSHes. 
- int getSPAdjust(const MachineInstr *MI) const override; - /// isCoalescableExtInstr - Return true if the instruction is a "coalescable" /// extension instruction. That is, it's like a copy where it's legal for the /// source to overlap the destination. e.g. X86::MOVSX64rr32. If this returns diff --git a/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/llvm/lib/Target/X86/X86MachineFunctionInfo.h index 9fd03a7059c..b23a744da68 100644 --- a/llvm/lib/Target/X86/X86MachineFunctionInfo.h +++ b/llvm/lib/Target/X86/X86MachineFunctionInfo.h @@ -77,9 +77,6 @@ class X86MachineFunctionInfo : public MachineFunctionInfo { unsigned ArgumentStackSize; /// NumLocalDynamics - Number of local-dynamic TLS accesses. unsigned NumLocalDynamics; - /// HasPushSequences - Keeps track of whether this function uses sequences - /// of pushes to pass function parameters. - bool HasPushSequences; private: /// ForwardedMustTailRegParms - A list of virtual and physical registers @@ -100,8 +97,7 @@ public: VarArgsGPOffset(0), VarArgsFPOffset(0), ArgumentStackSize(0), - NumLocalDynamics(0), - HasPushSequences(false) {} + NumLocalDynamics(0) {} explicit X86MachineFunctionInfo(MachineFunction &MF) : ForceFramePointer(false), @@ -117,15 +113,11 @@ public: VarArgsGPOffset(0), VarArgsFPOffset(0), ArgumentStackSize(0), - NumLocalDynamics(0), - HasPushSequences(false) {} + NumLocalDynamics(0) {} bool getForceFramePointer() const { return ForceFramePointer;} void setForceFramePointer(bool forceFP) { ForceFramePointer = forceFP; } - bool getHasPushSequences() const { return HasPushSequences; } - void setHasPushSequences(bool HasPush) { HasPushSequences = HasPush; } - bool getRestoreBasePointer() const { return RestoreBasePointerOffset!=0; } void setRestoreBasePointer(const MachineFunction *MF); int getRestoreBasePointerOffset() const {return RestoreBasePointerOffset; } diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp index 0fa38f45370..09e651cebfb 100644 --- a/llvm/lib/Target/X86/X86RegisterInfo.cpp +++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -468,6 +468,8 @@ void X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, unsigned FIOperandNum, RegScavenger *RS) const { + assert(SPAdj == 0 && "Unexpected"); + MachineInstr &MI = *II; MachineFunction &MF = *MI.getParent()->getParent(); const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); @@ -504,9 +506,6 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, } else FIOffset = TFI->getFrameIndexOffset(MF, FrameIndex); - if (BasePtr == StackPtr) - FIOffset += SPAdj; - // The frame index format for stackmaps and patchpoints is different from the // X86 format. It only has a FI and an offset. 
if (Opc == TargetOpcode::STACKMAP || Opc == TargetOpcode::PATCHPOINT) { diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp index 78fe430c7f6..3675186e8a1 100644 --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -193,7 +193,6 @@ public: void addIRPasses() override; bool addInstSelector() override; bool addILPOpts() override; - void addPreRegAlloc() override; void addPostRegAlloc() override; void addPreEmitPass() override; }; @@ -227,10 +226,6 @@ bool X86PassConfig::addILPOpts() { return true; } -void X86PassConfig::addPreRegAlloc() { - addPass(createX86CallFrameOptimization()); -} - void X86PassConfig::addPostRegAlloc() { addPass(createX86FloatingPointStackifierPass()); } diff --git a/llvm/test/CodeGen/X86/inalloca-invoke.ll b/llvm/test/CodeGen/X86/inalloca-invoke.ll index cc11ab31c5d..b56f24d9962 100644 --- a/llvm/test/CodeGen/X86/inalloca-invoke.ll +++ b/llvm/test/CodeGen/X86/inalloca-invoke.ll @@ -31,7 +31,7 @@ blah: to label %invoke.cont unwind label %lpad ; Uses end as sret param. -; CHECK: pushl %[[end]] +; CHECK: movl %[[end]], (%esp) ; CHECK: calll _plus invoke.cont: diff --git a/llvm/test/CodeGen/X86/movtopush.ll b/llvm/test/CodeGen/X86/movtopush.ll index d7ec2d0fe90..cb48ed747be 100644 --- a/llvm/test/CodeGen/X86/movtopush.ll +++ b/llvm/test/CodeGen/X86/movtopush.ll @@ -1,65 +1,24 @@ ; RUN: llc < %s -mtriple=i686-windows | FileCheck %s -check-prefix=NORMAL -; RUN: llc < %s -mtriple=x86_64-windows | FileCheck %s -check-prefix=X64 ; RUN: llc < %s -mtriple=i686-windows -force-align-stack -stack-alignment=32 | FileCheck %s -check-prefix=ALIGNED - declare void @good(i32 %a, i32 %b, i32 %c, i32 %d) declare void @inreg(i32 %a, i32 inreg %b, i32 %c, i32 %d) ; Here, we should have a reserved frame, so we don't expect pushes -; NORMAL-LABEL: test1: +; NORMAL-LABEL: test1 ; NORMAL: subl $16, %esp ; NORMAL-NEXT: movl $4, 12(%esp) ; NORMAL-NEXT: movl $3, 8(%esp) ; NORMAL-NEXT: movl $2, 4(%esp) ; NORMAL-NEXT: movl $1, (%esp) ; NORMAL-NEXT: call -; NORMAL-NEXT: addl $16, %esp define void @test1() { entry: call void @good(i32 1, i32 2, i32 3, i32 4) ret void } -; We're optimizing for code size, so we should get pushes for x86, -; even though there is a reserved call frame. 
-; Make sure we don't touch x86-64 -; NORMAL-LABEL: test1b: -; NORMAL-NOT: subl {{.*}} %esp -; NORMAL: pushl $4 -; NORMAL-NEXT: pushl $3 -; NORMAL-NEXT: pushl $2 -; NORMAL-NEXT: pushl $1 -; NORMAL-NEXT: call -; NORMAL-NEXT: addl $16, %esp -; X64-LABEL: test1b: -; X64: movl $1, %ecx -; X64-NEXT: movl $2, %edx -; X64-NEXT: movl $3, %r8d -; X64-NEXT: movl $4, %r9d -; X64-NEXT: callq good -define void @test1b() optsize { -entry: - call void @good(i32 1, i32 2, i32 3, i32 4) - ret void -} - -; Same as above, but for minsize -; NORMAL-LABEL: test1c: -; NORMAL-NOT: subl {{.*}} %esp -; NORMAL: pushl $4 -; NORMAL-NEXT: pushl $3 -; NORMAL-NEXT: pushl $2 -; NORMAL-NEXT: pushl $1 -; NORMAL-NEXT: call -; NORMAL-NEXT: addl $16, %esp -define void @test1c() minsize { -entry: - call void @good(i32 1, i32 2, i32 3, i32 4) - ret void -} - -; If we have a reserved frame, we should have pushes -; NORMAL-LABEL: test2: +; Here, we expect a sequence of 4 immediate pushes +; NORMAL-LABEL: test2 ; NORMAL-NOT: subl {{.*}} %esp ; NORMAL: pushl $4 ; NORMAL-NEXT: pushl $3 @@ -75,53 +34,53 @@ entry: ; Again, we expect a sequence of 4 immediate pushes ; Checks that we generate the right pushes for >8bit immediates -; NORMAL-LABEL: test2b: +; NORMAL-LABEL: test2b ; NORMAL-NOT: subl {{.*}} %esp ; NORMAL: pushl $4096 ; NORMAL-NEXT: pushl $3072 ; NORMAL-NEXT: pushl $2048 ; NORMAL-NEXT: pushl $1024 ; NORMAL-NEXT: call -; NORMAL-NEXT: addl $16, %esp -define void @test2b() optsize { +define void @test2b(i32 %k) { entry: + %a = alloca i32, i32 %k call void @good(i32 1024, i32 2048, i32 3072, i32 4096) ret void } ; The first push should push a register -; NORMAL-LABEL: test3: +; NORMAL-LABEL: test3 ; NORMAL-NOT: subl {{.*}} %esp ; NORMAL: pushl $4 ; NORMAL-NEXT: pushl $3 ; NORMAL-NEXT: pushl $2 ; NORMAL-NEXT: pushl %e{{..}} ; NORMAL-NEXT: call -; NORMAL-NEXT: addl $16, %esp -define void @test3(i32 %k) optsize { +define void @test3(i32 %k) { entry: + %a = alloca i32, i32 %k call void @good(i32 %k, i32 2, i32 3, i32 4) ret void } ; We don't support weird calling conventions -; NORMAL-LABEL: test4: +; NORMAL-LABEL: test4 ; NORMAL: subl $12, %esp ; NORMAL-NEXT: movl $4, 8(%esp) ; NORMAL-NEXT: movl $3, 4(%esp) ; NORMAL-NEXT: movl $1, (%esp) ; NORMAL-NEXT: movl $2, %eax ; NORMAL-NEXT: call -; NORMAL-NEXT: addl $12, %esp -define void @test4() optsize { +define void @test4(i32 %k) { entry: + %a = alloca i32, i32 %k call void @inreg(i32 1, i32 2, i32 3, i32 4) ret void } -; When there is no reserved call frame, check that additional alignment -; is added when the pushes don't add up to the required alignment. -; ALIGNED-LABEL: test5: +; Check that additional alignment is added when the pushes +; don't add up to the required alignment. +; ALIGNED-LABEL: test5 ; ALIGNED: subl $16, %esp ; ALIGNED-NEXT: pushl $4 ; ALIGNED-NEXT: pushl $3 @@ -138,7 +97,7 @@ entry: ; Check that pushing the addresses of globals (Or generally, things that ; aren't exactly immediates) isn't broken. ; Fixes PR21878. 
-; NORMAL-LABEL: test6: +; NORMAL-LABEL: test6 ; NORMAL: pushl $_ext ; NORMAL-NEXT: call declare void @f(i8*) @@ -151,108 +110,3 @@ bb: alloca i32 ret void } - -; Check that we fold simple cases into the push -; NORMAL-LABEL: test7: -; NORMAL-NOT: subl {{.*}} %esp -; NORMAL: movl 4(%esp), [[EAX:%e..]] -; NORMAL-NEXT: pushl $4 -; NORMAL-NEXT: pushl ([[EAX]]) -; NORMAL-NEXT: pushl $2 -; NORMAL-NEXT: pushl $1 -; NORMAL-NEXT: call -; NORMAL-NEXT: addl $16, %esp -define void @test7(i32* %ptr) optsize { -entry: - %val = load i32* %ptr - call void @good(i32 1, i32 2, i32 %val, i32 4) - ret void -} - -; But we don't want to fold stack-relative loads into the push, -; because the offset will be wrong -; NORMAL-LABEL: test8: -; NORMAL-NOT: subl {{.*}} %esp -; NORMAL: movl 4(%esp), [[EAX:%e..]] -; NORMAL-NEXT: pushl $4 -; NORMAL-NEXT: pushl [[EAX]] -; NORMAL-NEXT: pushl $2 -; NORMAL-NEXT: pushl $1 -; NORMAL-NEXT: call -; NORMAL-NEXT: addl $16, %esp -define void @test8(i32* %ptr) optsize { -entry: - %val = ptrtoint i32* %ptr to i32 - call void @good(i32 1, i32 2, i32 %val, i32 4) - ret void -} - -; If one function is using push instructions, and the other isn't -; (because it has frame-index references), then we must resolve -; these references correctly. -; NORMAL-LABEL: test9: -; NORMAL-NOT: leal (%esp), -; NORMAL: pushl $4 -; NORMAL-NEXT: pushl $3 -; NORMAL-NEXT: pushl $2 -; NORMAL-NEXT: pushl $1 -; NORMAL-NEXT: call -; NORMAL-NEXT: addl $16, %esp -; NORMAL-NEXT: subl $16, %esp -; NORMAL-NEXT: leal 16(%esp), [[EAX:%e..]] -; NORMAL-NEXT: movl [[EAX]], 12(%esp) -; NORMAL-NEXT: movl $7, 8(%esp) -; NORMAL-NEXT: movl $6, 4(%esp) -; NORMAL-NEXT: movl $5, (%esp) -; NORMAL-NEXT: call -; NORMAL-NEXT: addl $16, %esp -define void @test9() optsize { -entry: - %p = alloca i32, align 4 - call void @good(i32 1, i32 2, i32 3, i32 4) - %0 = ptrtoint i32* %p to i32 - call void @good(i32 5, i32 6, i32 7, i32 %0) - ret void -} - -; We can end up with an indirect call which gets reloaded on the spot. -; Make sure we reference the correct stack slot - we spill into (%esp) -; and reload from 16(%esp) due to the pushes. -; NORMAL-LABEL: test10: -; NORMAL: movl $_good, [[ALLOC:.*]] -; NORMAL-NEXT: movl [[ALLOC]], [[EAX:%e..]] -; NORMAL-NEXT: movl [[EAX]], (%esp) # 4-byte Spill -; NORMAL: nop -; NORMAL: pushl $4 -; NORMAL-NEXT: pushl $3 -; NORMAL-NEXT: pushl $2 -; NORMAL-NEXT: pushl $1 -; NORMAL-NEXT: calll *16(%esp) -; NORMAL-NEXT: addl $16, %esp -define void @test10() optsize { - %stack_fptr = alloca void (i32, i32, i32, i32)* - store void (i32, i32, i32, i32)* @good, void (i32, i32, i32, i32)** %stack_fptr - %good_ptr = load volatile void (i32, i32, i32, i32)** %stack_fptr - call void asm sideeffect "nop", "~{ax},~{bx},~{cx},~{dx},~{bp},~{si},~{di}"() - call void (i32, i32, i32, i32)* %good_ptr(i32 1, i32 2, i32 3, i32 4) - ret void -} - -; We can't fold the load from the global into the push because of -; interference from the store -; NORMAL-LABEL: test11: -; NORMAL: movl _the_global, [[EAX:%e..]] -; NORMAL-NEXT: movl $42, _the_global -; NORMAL-NEXT: pushl $4 -; NORMAL-NEXT: pushl $3 -; NORMAL-NEXT: pushl $2 -; NORMAL-NEXT: pushl [[EAX]] -; NORMAL-NEXT: call -; NORMAL-NEXT: addl $16, %esp -@the_global = external global i32 -define void @test11() optsize { - %myload = load i32* @the_global - store i32 42, i32* @the_global - call void @good(i32 %myload, i32 2, i32 3, i32 4) - ret void -} |
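The X86InstrInfo.cpp hunk above deletes X86InstrInfo::getSPAdjust, which reported how much each instruction in a call sequence moves the stack pointer. As a minimal standalone sketch of the arithmetic that function applied to the call-frame setup/destroy pseudos (the free-function form and the name callFrameSPAdjust are illustrative assumptions, not part of the patch or of LLVM):

#include <cstdint>

// Mirrors the removed getSPAdjust handling of the call-frame pseudos: the
// requested frame size (operand 0) is rounded up to the stack alignment,
// the callee-popped byte count (operand 1) is subtracted, and the sign
// records whether the stack grows (frame setup) or shrinks (frame destroy).
int64_t callFrameSPAdjust(int64_t FrameSize, int64_t CalleePopped,
                          int64_t StackAlign, bool IsSetup) {
  int64_t Rounded = (FrameSize + StackAlign - 1) / StackAlign * StackAlign;
  int64_t Adj = Rounded - CalleePopped;
  return IsSetup ? Adj : -Adj;
}

For example, a frame-setup pseudo asking for 20 bytes of outgoing arguments under 16-byte stack alignment, with nothing popped by the callee, yields an adjustment of 32 bytes.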
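The "Pattern match OR as ADD" block in the X86InstrCompiler.td hunk rests on a simple bit-level fact: when the operands share no set bits, OR and ADD agree, so the selector may treat the OR as an ADD (and later form an LEA). A small self-contained check of that identity (hypothetical helper name, not from the patch):

#include <cassert>
#include <cstdint>

// When two operands have no set bits in common, no carries occur, so
// bitwise OR and integer ADD produce the same value. This is the property
// the restored or_is_add PatFrag verifies with known-bits analysis.
bool orMatchesAddForDisjointBits(uint64_t A, uint64_t B) {
  return (A & B) == 0 && (A | B) == (A + B);
}

int main() {
  assert(orMatchesAddForDisjointBits(0xF0, 0x0F));  // disjoint bits: or == add == 0xFF
  assert(!orMatchesAddForDisjointBits(0x18, 0x08)); // bit 3 overlaps: no claim made
  return 0;
}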