Diffstat (limited to 'llvm')
 llvm/include/llvm/Target/TargetLowering.h          |   9
 llvm/include/llvm/Target/TargetOptions.h           |   5
 llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp |  48
 llvm/lib/Target/TargetMachine.cpp                  |   7
 llvm/lib/Target/X86/README.txt                     |  80
 llvm/lib/Target/X86/X86CallingConv.td              |  50
 llvm/lib/Target/X86/X86ISelLowering.cpp            | 511
 llvm/lib/Target/X86/X86ISelLowering.h              |  24
 llvm/lib/Target/X86/X86InstrInfo.cpp               |   2
 llvm/lib/Target/X86/X86InstrInfo.td                |  43
 llvm/lib/Target/X86/X86InstrX86-64.td              |  35
 llvm/lib/Target/X86/X86MachineFunctionInfo.h       |  15
 llvm/lib/Target/X86/X86RegisterInfo.cpp            | 129
 llvm/test/CodeGen/X86/tailcall1.ll                 |  11
 llvm/test/CodeGen/X86/tailcallpic1.ll              |  12
 llvm/test/CodeGen/X86/tailcallpic2.ll              |  12
16 files changed, 928 insertions, 65 deletions
diff --git a/llvm/include/llvm/Target/TargetLowering.h b/llvm/include/llvm/Target/TargetLowering.h
index 8586d7f0916..1352eaff732 100644
--- a/llvm/include/llvm/Target/TargetLowering.h
+++ b/llvm/include/llvm/Target/TargetLowering.h
@@ -860,6 +860,15 @@ public:
   /// implement this.  The default implementation of this aborts.
   virtual SDOperand LowerOperation(SDOperand Op, SelectionDAG &DAG);
 
+  /// IsEligibleForTailCallOptimization - Check whether the call is eligible
+  /// for tail call optimization. Targets that want to do tail call
+  /// optimization should implement this function.
+  virtual bool IsEligibleForTailCallOptimization(SDOperand Call,
+                                                 SDOperand Ret,
+                                                 SelectionDAG &DAG) const {
+    return false;
+  }
+
   /// CustomPromoteOperation - This callback is invoked for operations that are
   /// unsupported by the target, are registered to use 'custom' lowering, and
   /// whose type needs to be promoted.
diff --git a/llvm/include/llvm/Target/TargetOptions.h b/llvm/include/llvm/Target/TargetOptions.h
index 7421f96b7bc..dd544324420 100644
--- a/llvm/include/llvm/Target/TargetOptions.h
+++ b/llvm/include/llvm/Target/TargetOptions.h
@@ -73,6 +73,11 @@ namespace llvm {
   /// ExceptionHandling - This flag indicates that exception information should
   /// be emitted.
   extern bool ExceptionHandling;
+
+  /// PerformTailCallOpt - This flag is enabled when -tailcallopt is specified
+  /// on the command line. When the flag is on, the target will perform tail
+  /// call optimization (popping the caller's stack) provided it supports it.
+  extern bool PerformTailCallOpt;
 } // End llvm namespace
 
 #endif
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index 5f321654d86..b1bf475c473 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -4444,6 +4444,48 @@ static void copyCatchInfo(BasicBlock *SrcBB, BasicBlock *DestBB,
     }
 }
 
+/// CheckDAGForTailCallsAndFixThem - This function looks for CALL nodes in the
+/// DAG and fixes their tail call attribute operands.
+static void CheckDAGForTailCallsAndFixThem(SelectionDAG &DAG,
+                                           TargetLowering& TLI) {
+  SDNode *Ret = NULL;
+  SDOperand Terminator = DAG.getRoot();
+
+  // Find the RET node.
+  if (Terminator.getOpcode() == ISD::RET) {
+    Ret = Terminator.Val;
+  }
+
+  // Fix the tail call attribute of CALL nodes.
+  for (SelectionDAG::allnodes_iterator BE = DAG.allnodes_begin(),
+         BI = prior(DAG.allnodes_end()); BI != BE; --BI) {
+    if (BI->getOpcode() == ISD::CALL) {
+      SDOperand OpRet(Ret, 0);
+      SDOperand OpCall(static_cast<SDNode*>(BI), 0);
+      bool isMarkedTailCall =
+        cast<ConstantSDNode>(OpCall.getOperand(3))->getValue() != 0;
+      // If the CALL node has its tail call attribute set to true and the call
+      // is not eligible (there is no RET, or the target rejects it), the
+      // attribute is fixed to false.  The
+      // TargetLowering::IsEligibleForTailCallOptimization function must
+      // correctly identify tail call optimizable calls.
+      if (isMarkedTailCall &&
+          (Ret==NULL ||
+           !TLI.IsEligibleForTailCallOptimization(OpCall, OpRet, DAG))) {
+        SmallVector<SDOperand, 32> Ops;
+        unsigned idx=0;
+        for(SDNode::op_iterator I =OpCall.Val->op_begin(),
+              E=OpCall.Val->op_end(); I!=E; I++, idx++) {
+          if (idx!=3)
+            Ops.push_back(*I);
+          else
+            Ops.push_back(DAG.getConstant(false, TLI.getPointerTy()));
+        }
+        DAG.UpdateNodeOperands(OpCall, Ops.begin(), Ops.size());
+      }
+    }
+  }
+}
+
 void SelectionDAGISel::BuildSelectionDAG(SelectionDAG &DAG, BasicBlock *LLVMBB,
        std::vector<std::pair<MachineInstr*, unsigned> > &PHINodesToUpdate,
                                          FunctionLoweringInfo &FuncInfo) {
@@ -4621,6 +4663,12 @@ void SelectionDAGISel::BuildSelectionDAG(SelectionDAG &DAG, BasicBlock *LLVMBB,
 
   // Make sure the root of the DAG is up-to-date.
   DAG.setRoot(SDL.getRoot());
+
+  // Check whether calls in this block are real tail calls. Fix up CALL nodes
+  // with a correct tail call attribute so that the target can rely on the
+  // tail call attribute indicating whether the call is really eligible for
+  // tail call optimization.
+  CheckDAGForTailCallsAndFixThem(DAG, TLI);
 }
 
 void SelectionDAGISel::CodeGenAndEmitDAG(SelectionDAG &DAG) {
diff --git a/llvm/lib/Target/TargetMachine.cpp b/llvm/lib/Target/TargetMachine.cpp
index 6c00a3f492b..9caea11dd39 100644
--- a/llvm/lib/Target/TargetMachine.cpp
+++ b/llvm/lib/Target/TargetMachine.cpp
@@ -33,6 +33,7 @@ namespace llvm {
   bool ExceptionHandling;
   Reloc::Model RelocationModel;
   CodeModel::Model CMModel;
+  bool PerformTailCallOpt;
 }
 namespace {
   cl::opt<bool, true> PrintCode("print-machineinstrs",
@@ -116,6 +117,12 @@ namespace {
       clEnumValN(CodeModel::Large, "large",
                  "  Large code model"),
       clEnumValEnd));
+
+  cl::opt<bool, true>
+  EnablePerformTailCallOpt("tailcallopt",
+                           cl::desc("Turn on tail call optimization."),
+                           cl::location(PerformTailCallOpt),
+                           cl::init(false));
 }
 
//---------------------------------------------------------------------------
diff --git a/llvm/lib/Target/X86/README.txt b/llvm/lib/Target/X86/README.txt
index 9bafff73d5c..0d4dce32d83 100644
--- a/llvm/lib/Target/X86/README.txt
+++ b/llvm/lib/Target/X86/README.txt
@@ -1368,3 +1368,83 @@ L7:
 L5:
 
 //===---------------------------------------------------------------------===//
+
+Tail call optimization improvements: Tail call optimization currently
+pushes all arguments on the top of the stack (their normal place if
+this were not a tail call optimized function call) before moving them
+to the actual stack slots. This is done to prevent overwriting of
+parameters (see example below) that might still be used, since the
+arguments of the callee overwrite the caller's arguments.
+
+ example:
+
+int callee(int32, int64);
+int caller(int32 arg1, int32 arg2) {
+  int64 local = arg2 * 2;
+  return callee(arg2, (int64)local);
+}
+
+[arg1]          [!arg2 no longer valid since we moved local onto it]
+[arg2]      ->  [(int64)
+[RETADDR]        local  ]
+
+Moving arg1 onto the stack slot of the callee function would overwrite
+arg2 of the caller.
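The safe ordering amounts to reading every source slot before writing any
destination slot. A minimal standalone sketch (hypothetical helper, not the
patch's SelectionDAG lowering; 'frame' models the argument area shared by
caller and callee, and 'moves' holds (source, destination) slot offsets):

  #include <cstring>
  #include <utility>
  #include <vector>

  // Copy arguments for a tail call without clobbering: first load all source
  // slots into temporaries (the "push on the top of the stack" step described
  // above), then store the temporaries to their final slots.
  void copyArgsForTailCall(char *frame,
                           const std::vector<std::pair<int, int> > &moves,
                           int slotSize) {
    std::vector<std::vector<char> > saved;
    for (size_t i = 0; i != moves.size(); ++i)
      saved.push_back(std::vector<char>(frame + moves[i].first,
                                        frame + moves[i].first + slotSize));
    for (size_t i = 0; i != moves.size(); ++i)
      std::memcpy(frame + moves[i].second, &saved[i][0], slotSize);
  }

With swapped arguments - caller(a, b) tail-calling callee(b, a) - the
temporaries are what prevent the second store from reading an already
overwritten slot.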
+
+Possible optimizations:
+
+ - only push those arguments to the top of the stack that are actual
+   parameters of the caller function and have no local value in the
+   caller
+
+   in the above example local does not need to be pushed onto the top
+   of the stack as it is definitely not a caller's function parameter
+
+ - analyse the actual parameters of the callee to see which would
+   overwrite a caller parameter which is used by the callee and only
+   push them onto the top of the stack
+
+   int callee (int32 arg1, int32 arg2);
+   int caller (int32 arg1, int32 arg2) {
+       return callee(arg1,arg2);
+   }
+
+   here we don't need to write any variables to the top of the stack
+   since they don't overwrite each other
+
+   int callee (int32 arg1, int32 arg2);
+   int caller (int32 arg1, int32 arg2) {
+       return callee(arg2,arg1);
+   }
+
+   here we need to push the arguments because they overwrite each other
+
+
+   code for lowering directly onto the caller's arguments:
++  SmallVector<std::pair<unsigned, SDOperand>, 8> RegsToPass;
++  SmallVector<SDOperand, 8> MemOpChains;
++
++  SDOperand FramePtr;
++  SDOperand PtrOff;
++  SDOperand FIN;
++  int FI = 0;
++  // Walk the register/memloc assignments, inserting copies/loads.
++  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
++    CCValAssign &VA = ArgLocs[i];
++    SDOperand Arg = Op.getOperand(5+2*VA.getValNo());
++
++    ....
++
++    if (VA.isRegLoc()) {
++      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
++    } else {
++      assert(VA.isMemLoc());
++      // create frame index
++      int32_t Offset = VA.getLocMemOffset()+FPDiff;
++      uint32_t OpSize = (MVT::getSizeInBits(VA.getLocVT())+7)/8;
++      FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset);
++      FIN = DAG.getFrameIndex(FI, MVT::i32);
++      // store relative to framepointer
++      MemOpChains.push_back(DAG.getStore(Chain, Arg, FIN, NULL, 0));
++    }
++  }
+//===---------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/X86/X86CallingConv.td b/llvm/lib/Target/X86/X86CallingConv.td
index 9c2d95a1991..f23e5806563 100644
--- a/llvm/lib/Target/X86/X86CallingConv.td
+++ b/llvm/lib/Target/X86/X86CallingConv.td
@@ -127,6 +127,40 @@ def CC_X86_64_C : CallingConv<[
   CCIfType<[v8i8, v4i16, v2i32, v1i64], CCAssignToStack<8, 8>>
 ]>;
 
+// Tail call convention (fast): one register is reserved for the target
+// address, namely R9.
+def CC_X86_64_TailCall : CallingConv<[
+  // Promote i8/i16 arguments to i32.
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+  CCIfStruct<CCStructAssign<[RDI, RSI, RDX, RCX, R8]>>,
+
+  // The first 5 integer arguments are passed in integer registers.
+  CCIfType<[i32], CCAssignToReg<[EDI, ESI, EDX, ECX, R8D]>>,
+  CCIfType<[i64], CCAssignToReg<[RDI, RSI, RDX, RCX, R8]>>,
+
+  // The first 8 FP/Vector arguments are passed in XMM registers.
+  CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+              CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>>,
+
+  // The first 5 MMX vector arguments are passed in GPRs.
+  CCIfType<[v8i8, v4i16, v2i32, v1i64],
+              CCAssignToReg<[RDI, RSI, RDX, RCX, R8]>>,
+
+  // The 'nest' parameter, if any, is passed in R10.
+  CCIfNest<CCAssignToReg<[R10]>>,
+
+  // Integer/FP values get stored in stack slots that are 8 bytes in size and
+  // 8-byte aligned if there are no more registers to hold them.
+  CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>,
+
+  // Vectors get 16-byte stack slots that are 16-byte aligned.
+  CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>,
+
+  // __m64 vectors get 8-byte stack slots that are 8-byte aligned.
+  CCIfType<[v8i8, v4i16, v2i32, v1i64], CCAssignToStack<8, 8>>
+]>;
+
 //===----------------------------------------------------------------------===//
 // X86 C Calling Convention
 
@@ -173,6 +207,22 @@ def CC_X86_32_C : CallingConv<[
   CCDelegateTo<CC_X86_32_Common>
 ]>;
 
+/// Same as the C calling convention except that ECX is not free; it is used
+/// for storing a potential pointer to the tail called function.
+def CC_X86_32_TailCall : CallingConv<[
+  // Promote i8/i16 arguments to i32.
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+  // The 'nest' parameter, if any, is passed in ECX.
+  CCIfNest<CCAssignToReg<[ECX]>>,
+
+  // The first 2 integer arguments, if marked 'inreg' and if the call is not
+  // a vararg call, are passed in integer registers.
+  CCIfNotVarArg<CCIfInReg<CCIfType<[i32], CCAssignToReg<[EAX, EDX]>>>>,
+
+  // Otherwise, same as everything else.
+  CCDelegateTo<CC_X86_32_Common>
+]>;
 
 def CC_X86_32_FastCall : CallingConv<[
   // Promote i8/i16 arguments to i32.
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 1917a6a291f..8767d8d33b9 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -32,6 +32,8 @@
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/SSARegMap.h"
 #include "llvm/Support/MathExtras.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ParameterAttributes.h"
@@ -43,6 +45,7 @@ X86TargetLowering::X86TargetLowering(TargetMachine &TM)
   X86ScalarSSEf64 = Subtarget->hasSSE2();
   X86ScalarSSEf32 = Subtarget->hasSSE1();
   X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;
+
   RegInfo = TM.getRegisterInfo();
@@ -641,6 +644,19 @@ X86TargetLowering::X86TargetLowering(TargetMachine &TM)
 //===----------------------------------------------------------------------===//
 
 #include "X86GenCallingConv.inc"
+
+/// GetPossiblePreceedingTailCall - Get the preceding X86ISD::TAILCALL node if
+/// it exists, skipping a possible ISD::TokenFactor.
+static SDOperand GetPossiblePreceedingTailCall(SDOperand Chain) {
+  if (Chain.getOpcode()==X86ISD::TAILCALL) {
+    return Chain;
+  } else if (Chain.getOpcode()==ISD::TokenFactor) {
+    if (Chain.getNumOperands() &&
+        Chain.getOperand(0).getOpcode()==X86ISD::TAILCALL)
+      return Chain.getOperand(0);
+  }
+  return Chain;
+}
 
 /// LowerRET - Lower an ISD::RET node.
 SDOperand X86TargetLowering::LowerRET(SDOperand Op, SelectionDAG &DAG) {
@@ -651,8 +667,7 @@ SDOperand X86TargetLowering::LowerRET(SDOperand Op, SelectionDAG &DAG) {
   bool isVarArg = DAG.getMachineFunction().getFunction()->isVarArg();
   CCState CCInfo(CC, isVarArg, getTargetMachine(), RVLocs);
   CCInfo.AnalyzeReturn(Op.Val, RetCC_X86);
-
-
+
   // If this is the first return lowered for this function, add the regs to the
   // liveout set for the function.
   if (DAG.getMachineFunction().liveout_empty()) {
@@ -660,10 +675,38 @@ SDOperand X86TargetLowering::LowerRET(SDOperand Op, SelectionDAG &DAG) {
       if (RVLocs[i].isRegLoc())
         DAG.getMachineFunction().addLiveOut(RVLocs[i].getLocReg());
   }
-
   SDOperand Chain = Op.getOperand(0);
-  SDOperand Flag;
+  // Handle tail call return.
+  Chain = GetPossiblePreceedingTailCall(Chain);
+  if (Chain.getOpcode() == X86ISD::TAILCALL) {
+    SDOperand TailCall = Chain;
+    SDOperand TargetAddress = TailCall.getOperand(1);
+    SDOperand StackAdjustment = TailCall.getOperand(2);
+    assert(((TargetAddress.getOpcode() == ISD::Register &&
+               (cast<RegisterSDNode>(TargetAddress)->getReg() == X86::ECX ||
+                cast<RegisterSDNode>(TargetAddress)->getReg() == X86::R9)) ||
+              TargetAddress.getOpcode() == ISD::TargetExternalSymbol ||
+              TargetAddress.getOpcode() == ISD::TargetGlobalAddress) &&
+             "Expecting a global address, external symbol, or register");
+    assert(StackAdjustment.getOpcode() == ISD::Constant &&
+            "Expecting a constant value");
+
+    SmallVector<SDOperand,8> Operands;
+    Operands.push_back(Chain.getOperand(0));
+    Operands.push_back(TargetAddress);
+    Operands.push_back(StackAdjustment);
+    // Copy the registers used by the call. The last operand is a flag, so it
+    // is not copied.
+    for(unsigned i=3; i < TailCall.getNumOperands()-1;i++) {
+      Operands.push_back(Chain.getOperand(i));
+    }
+    return DAG.getNode(X86ISD::TC_RETURN, MVT::Other, &Operands[0],
+                       Operands.size());
+  }
+
+  // Regular return.
+  SDOperand Flag;
+
   // Copy the result values into the output registers.
   if (RVLocs.size() != 1 || !RVLocs[0].isRegLoc() ||
       RVLocs[0].getLocReg() != X86::ST0) {
@@ -684,7 +727,7 @@ SDOperand X86TargetLowering::LowerRET(SDOperand Op, SelectionDAG &DAG) {
     if ((X86ScalarSSEf32 && RVLocs[0].getValVT()==MVT::f32) ||
         (X86ScalarSSEf64 && RVLocs[0].getValVT()==MVT::f64)) {
       SDOperand MemLoc;
-
+
       // If this is a load into a scalarsse value, don't store the loaded value
       // back to the stack, only to reload it: just replace the scalar-sse load.
       if (ISD::isNON_EXTLoad(Value.Val) &&
@@ -784,12 +827,14 @@ LowerCallResult(SDOperand Chain, SDOperand InFlag, SDNode *TheCall,
 
 //===----------------------------------------------------------------------===//
-//                C & StdCall Calling Convention implementation
+//                C & StdCall & Fast Calling Convention implementation
 //===----------------------------------------------------------------------===//
 //  StdCall calling convention seems to be standard for many Windows' API
 //  routines and around. It differs from C calling convention just a little:
 //  callee should clean up the stack, not caller. Symbols should be also
 //  decorated in some fancy way :) It doesn't support any vector arguments.
+//  For info on the fast calling convention see the Fast Calling Convention
+//  (tail call) implementation, LowerX86_TailCallTo.
 
 /// AddLiveIn - This helper function adds the specified physical register to
 /// the MachineFunction as a live in value.  It also creates a corresponding
 /// virtual register for it.
@@ -802,6 +847,9 @@ static unsigned AddLiveIn(MachineFunction &MF, unsigned PReg,
   return VReg;
 }
 
+// Align stack arguments according to the platform alignment needed for tail
+// calls.
+unsigned GetAlignedArgumentStackSize(unsigned StackSize, SelectionDAG& DAG);
+
 SDOperand X86TargetLowering::LowerMemArgument(SDOperand Op, SelectionDAG &DAG,
                                               const CCValAssign &VA,
                                               MachineFrameInfo *MFI,
@@ -826,13 +874,17 @@ SDOperand X86TargetLowering::LowerCCCArguments(SDOperand Op, SelectionDAG &DAG,
   MachineFrameInfo *MFI = MF.getFrameInfo();
   SDOperand Root = Op.getOperand(0);
   bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
-
+  unsigned CC = MF.getFunction()->getCallingConv();
   // Assign locations to all of the incoming arguments.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(MF.getFunction()->getCallingConv(), isVarArg,
+  CCState CCInfo(CC, isVarArg,
                  getTargetMachine(), ArgLocs);
-  CCInfo.AnalyzeFormalArguments(Op.Val, CC_X86_32_C);
-
+  // Check for a possible tail call calling convention.
+  if (CC == CallingConv::Fast && PerformTailCallOpt)
+    CCInfo.AnalyzeFormalArguments(Op.Val, CC_X86_32_TailCall);
+  else
+    CCInfo.AnalyzeFormalArguments(Op.Val, CC_X86_32_C);
+
   SmallVector<SDOperand, 8> ArgValues;
   unsigned LastVal = ~0U;
   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
@@ -877,6 +929,9 @@ SDOperand X86TargetLowering::LowerCCCArguments(SDOperand Op, SelectionDAG &DAG,
   }
 
   unsigned StackSize = CCInfo.getNextStackOffset();
+  // Align the stack specially for tail calls.
+  if (CC==CallingConv::Fast)
+    StackSize = GetAlignedArgumentStackSize(StackSize,DAG);
 
   ArgValues.push_back(Root);
@@ -885,7 +940,12 @@ SDOperand X86TargetLowering::LowerCCCArguments(SDOperand Op, SelectionDAG &DAG,
   if (isVarArg)
     VarArgsFrameIndex = MFI->CreateFixedObject(1, StackSize);
 
-  if (isStdCall && !isVarArg) {
+  // The tail call calling convention (CallingConv::Fast) does not support
+  // varargs.
+  assert(!(isVarArg && CC == CallingConv::Fast) &&
+         "CallingConv::Fast does not support varargs.");
+
+  if (isStdCall && !isVarArg &&
+      (CC==CallingConv::Fast && PerformTailCallOpt || CC!=CallingConv::Fast)) {
     BytesToPopOnReturn  = StackSize;    // Callee pops everything..
     BytesCallerReserves = 0;
   } else {
@@ -914,17 +974,21 @@ SDOperand X86TargetLowering::LowerCCCCallTo(SDOperand Op, SelectionDAG &DAG,
                                             unsigned CC) {
   SDOperand Chain     = Op.getOperand(0);
   bool isVarArg       = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
-  bool isTailCall     = cast<ConstantSDNode>(Op.getOperand(3))->getValue() != 0;
   SDOperand Callee    = Op.getOperand(4);
   unsigned NumOps     = (Op.getNumOperands() - 5) / 2;
-
+
   // Analyze operands of the call, assigning locations to each operand.
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
-  CCInfo.AnalyzeCallOperands(Op.Val, CC_X86_32_C);
+  if(CC==CallingConv::Fast && PerformTailCallOpt)
+    CCInfo.AnalyzeCallOperands(Op.Val, CC_X86_32_TailCall);
+  else
+    CCInfo.AnalyzeCallOperands(Op.Val, CC_X86_32_C);
 
   // Get a count of how many bytes are to be pushed on the stack.
   unsigned NumBytes = CCInfo.getNextStackOffset();
+  if (CC==CallingConv::Fast)
+    NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
 
   Chain = DAG.getCALLSEQ_START(Chain,DAG.getConstant(NumBytes, getPointerTy()));
@@ -1023,19 +1087,21 @@ SDOperand X86TargetLowering::LowerCCCCallTo(SDOperand Op, SelectionDAG &DAG,
   if (InFlag.Val)
     Ops.push_back(InFlag);
-
-  Chain = DAG.getNode(isTailCall ? X86ISD::TAILCALL : X86ISD::CALL,
-                      NodeTys, &Ops[0], Ops.size());
+
+  Chain = DAG.getNode(X86ISD::CALL, NodeTys, &Ops[0], Ops.size());
   InFlag = Chain.getValue(1);
 
   // Create the CALLSEQ_END node.
   unsigned NumBytesForCalleeToPush = 0;
-  if (CC == CallingConv::X86_StdCall) {
+  if (CC == CallingConv::X86_StdCall ||
+      (CC == CallingConv::Fast && PerformTailCallOpt)) {
     if (isVarArg)
       NumBytesForCalleeToPush = isSRet ? 4 : 0;
     else
       NumBytesForCalleeToPush = NumBytes;
+    assert(!(isVarArg && CC==CallingConv::Fast) &&
+            "CallingConv::Fast does not support varargs.");
   } else {
     // If this is a call to a struct-return function, the callee
     // pops the hidden struct pointer, so we have to push it back.
@@ -1132,7 +1198,8 @@ X86TargetLowering::LowerFastCCArguments(SDOperand Op, SelectionDAG &DAG) {
   if (!Subtarget->isTargetCygMing() && !Subtarget->isTargetWindows()) {
     // Make sure the instruction takes 8n+4 bytes to make sure the start of the
-    // arguments and the arguments after the retaddr has been pushed are aligned.
+    // arguments and the arguments after the retaddr has been pushed are
+    // aligned.
     if ((StackSize & 7) == 0)
       StackSize += 4;
   }
@@ -1194,7 +1261,8 @@ SDOperand X86TargetLowering::LowerFastCCCallTo(SDOperand Op, SelectionDAG &DAG,
   if (!Subtarget->isTargetCygMing() && !Subtarget->isTargetWindows()) {
     // Make sure the instruction takes 8n+4 bytes to make sure the start of the
-    // arguments and the arguments after the retaddr has been pushed are aligned.
+    // arguments and the arguments after the retaddr has been pushed are
+    // aligned.
     if ((NumBytes & 7) == 0)
       NumBytes += 4;
   }
@@ -1292,8 +1360,8 @@ SDOperand X86TargetLowering::LowerFastCCCallTo(SDOperand Op, SelectionDAG &DAG,
   if (InFlag.Val)
     Ops.push_back(InFlag);
 
-  // FIXME: Do not generate X86ISD::TAILCALL for now.
-  Chain = DAG.getNode(isTailCall ? X86ISD::TAILCALL : X86ISD::CALL,
+  assert(isTailCall==false && "no tail call here");
+  Chain = DAG.getNode(X86ISD::CALL,
                       NodeTys, &Ops[0], Ops.size());
   InFlag = Chain.getValue(1);
@@ -1312,6 +1380,314 @@ SDOperand X86TargetLowering::LowerFastCCCallTo(SDOperand Op, SelectionDAG &DAG,
   return SDOperand(LowerCallResult(Chain, InFlag, Op.Val, CC, DAG), Op.ResNo);
 }
 
+//===----------------------------------------------------------------------===//
+//                Fast Calling Convention (tail call) implementation
+//===----------------------------------------------------------------------===//
+
+//  Like stdcall, the callee cleans up the arguments, except that ECX is
+//  reserved for storing the tail called function's address. Only 2 registers
+//  are free for argument passing (inreg).
+//  Tail call optimization is performed provided:
+//                * tailcallopt is enabled
+//                * caller/callee are fastcc
+//                * elf/pic is disabled OR
+//                * elf/pic enabled + callee is in module + callee has
+//                  visibility protected or hidden
+//  To ensure the stack is aligned according to the platform ABI, the stack
+//  size is rounded up so that the argument delta is always a multiple of the
+//  stack alignment. (Dynamic linkers need this - darwin's dyld for example.)
+//  If the tail called function has more arguments than the caller, the caller
+//  needs to make sure that there is room to move the RETADDR to. This is
+//  achieved by reserving an area the size of the argument delta right after
+//  the original RETADDR, but before the saved framepointer or the spilled
+//  registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
+//  stack layout:
+//    arg1
+//    arg2
+//    RETADDR
+//    [ new RETADDR
+//      move area ]
+//    (possible EBP)
+//    ESI
+//    EDI
+//    local1 ..
+
+/// GetAlignedArgumentStackSize - Align the stack size, e.g. to 16n + 12 for a
+/// 16-byte alignment requirement.
+unsigned X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
+                                                        SelectionDAG& DAG) {
+  if (PerformTailCallOpt) {
+    MachineFunction &MF = DAG.getMachineFunction();
+    const TargetMachine &TM = MF.getTarget();
+    const TargetFrameInfo &TFI = *TM.getFrameInfo();
+    unsigned StackAlignment = TFI.getStackAlignment();
+    uint64_t AlignMask = StackAlignment - 1;
+    int64_t Offset = StackSize;
+    unsigned SlotSize = Subtarget->is64Bit() ? 8 : 4;
+    if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
+      // The remainder is no larger than StackAlignment - SlotSize, so just
+      // add the difference.
+      Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
+    } else {
+      // Mask out the lower bits, then add the stack alignment once plus the
+      // StackAlignment - SlotSize bytes.
+      Offset = ((~AlignMask) & Offset) + StackAlignment +
+        (StackAlignment-SlotSize);
+    }
+    StackSize = Offset;
+  }
+  return StackSize;
+}
+
+/// IsEligibleForTailCallOptimization - Check to see whether the next
+/// instruction following the call is a return. A function is eligible if
+/// caller/callee calling conventions match, currently only fastcc supports
+/// tail calls, and the function CALL is immediately followed by a RET.
+bool X86TargetLowering::IsEligibleForTailCallOptimization(SDOperand Call,
+                                                      SDOperand Ret,
+                                                      SelectionDAG& DAG) const {
+  bool IsEligible = false;
+
+  // Check whether the CALL node immediately precedes the RET node and whether
+  // the return uses the result of the node or is a void return.
+  if ((Ret.getNumOperands() == 1 &&
+       (Ret.getOperand(0)== SDOperand(Call.Val,1) ||
+        Ret.getOperand(0)== SDOperand(Call.Val,0))) ||
+      (Ret.getOperand(0)== SDOperand(Call.Val,Call.Val->getNumValues()-1) &&
+       Ret.getOperand(1)== SDOperand(Call.Val,0))) {
+    MachineFunction &MF = DAG.getMachineFunction();
+    unsigned CallerCC = MF.getFunction()->getCallingConv();
+    unsigned CalleeCC = cast<ConstantSDNode>(Call.getOperand(1))->getValue();
+    if (CalleeCC == CallingConv::Fast && CallerCC == CalleeCC) {
+      SDOperand Callee = Call.getOperand(4);
+      // On elf/pic %ebx needs to be livein.
+      if(getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+         Subtarget->isPICStyleGOT()) {
+        // Can only do local tail calls with PIC.
+        GlobalValue * GV = 0;
+        GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
+        if(G != 0 &&
+           (GV = G->getGlobal()) &&
+           (GV->hasHiddenVisibility() || GV->hasProtectedVisibility()))
+          IsEligible=true;
+      } else {
+        IsEligible=true;
+      }
+    }
+  }
+  return IsEligible;
+}
+
+SDOperand X86TargetLowering::LowerX86_TailCallTo(SDOperand Op,
+                                                     SelectionDAG &DAG,
+                                                     unsigned CC) {
+  SDOperand Chain     = Op.getOperand(0);
+  bool isVarArg       = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
+  bool isTailCall     = cast<ConstantSDNode>(Op.getOperand(3))->getValue() != 0;
+  SDOperand Callee    = Op.getOperand(4);
+  bool is64Bit        = Subtarget->is64Bit();
+
+  assert(isTailCall && PerformTailCallOpt && "Should only emit tail calls.");
+
+  // Analyze operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
+  if (is64Bit)
+    CCInfo.AnalyzeCallOperands(Op.Val, CC_X86_64_TailCall);
+  else
+    CCInfo.AnalyzeCallOperands(Op.Val, CC_X86_32_TailCall);
+
+  // Lower arguments at fp - stackoffset + fpdiff.
+  MachineFunction &MF = DAG.getMachineFunction();
+
+  unsigned NumBytesToBePushed =
+    GetAlignedArgumentStackSize(CCInfo.getNextStackOffset(), DAG);
+
+  unsigned NumBytesCallerPushed =
+    MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn();
+  int FPDiff = NumBytesCallerPushed - NumBytesToBePushed;
+
+  // Record the delta by which the return address stack slot moves, but only
+  // if this delta enlarges the reserved area (i.e. is more negative than the
+  // previously recorded delta).
+  if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta()))
+    MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff);
+
+  // Adjust the ret address stack slot.
+  if (FPDiff) {
+    MVT::ValueType VT = is64Bit ? MVT::i64 : MVT::i32;
+    SDOperand RetAddrFrIdx = getReturnAddressFrameIndex(DAG);
+    RetAddrFrIdx =
+      DAG.getLoad(VT, DAG.getEntryNode(),RetAddrFrIdx, NULL, 0);
+    // Emit a store of the saved ret value to the new location.
+    int SlotSize = is64Bit ? 8 : 4;
+    int NewReturnAddrFI =
+      MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize);
+    SDOperand NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT);
+    Chain = DAG.getStore(Chain,RetAddrFrIdx, NewRetAddrFrIdx, NULL, 0);
+  }
+
+  Chain = DAG.
+   getCALLSEQ_START(Chain, DAG.getConstant(NumBytesToBePushed, getPointerTy()));
+
+  SmallVector<std::pair<unsigned, SDOperand>, 8> RegsToPass;
+  SmallVector<SDOperand, 8> MemOpChains;
+  SmallVector<SDOperand, 8> MemOpChains2;
+  SDOperand FramePtr, StackPtr;
+  SDOperand PtrOff;
+  SDOperand FIN;
+  int FI = 0;
+
+  // Walk the register/memloc assignments, inserting copies/loads.  Lower
+  // arguments first to the stack slots where they would normally - in case of
+  // a normal function call - be.
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    SDOperand Arg = Op.getOperand(5+2*VA.getValNo());
+
+    // Promote the value if needed.
+    switch (VA.getLocInfo()) {
+    default: assert(0 && "Unknown loc info!");
+    case CCValAssign::Full: break;
+    case CCValAssign::SExt:
+      Arg = DAG.getNode(ISD::SIGN_EXTEND, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::ZExt:
+      Arg = DAG.getNode(ISD::ZERO_EXTEND, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::AExt:
+      Arg = DAG.getNode(ISD::ANY_EXTEND, VA.getLocVT(), Arg);
+      break;
+    }
+
+    if (VA.isRegLoc()) {
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+    } else {
+      assert(VA.isMemLoc());
+      if (StackPtr.Val == 0)
+        StackPtr = DAG.getRegister(getStackPtrReg(), getPointerTy());
+
+      MemOpChains.push_back(LowerMemOpCallTo(Op, DAG, StackPtr, VA, Chain,
+                                             Arg));
+    }
+  }
+
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, MVT::Other,
+                        &MemOpChains[0], MemOpChains.size());
+
+  // Build a sequence of copy-to-reg nodes chained together with token chain
+  // and flag operands which copy the outgoing args into registers.
+  SDOperand InFlag;
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, RegsToPass[i].first, RegsToPass[i].second,
+                             InFlag);
+    InFlag = Chain.getValue(1);
+  }
+  InFlag = SDOperand();
+
+  // Copy from the normal stack slots to the stack slots of the tail called
+  // function. This needs to be done because if we lowered the arguments
+  // directly to their real stack slots we might end up overwriting arguments
+  // that are still needed.
+  // TODO: To make this more efficient (sometimes saving a store/load) we could
+  // analyse the arguments and emit this store/load/store sequence only for
+  // arguments which would be overwritten otherwise.
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    if (!VA.isRegLoc()) {
+      SDOperand FlagsOp = Op.getOperand(6+2*VA.getValNo());
+      unsigned Flags    = cast<ConstantSDNode>(FlagsOp)->getValue();
+
+      // Get the source stack slot.
+      SDOperand PtrOff = DAG.getConstant(VA.getLocMemOffset(), getPointerTy());
+      PtrOff = DAG.getNode(ISD::ADD, getPointerTy(), StackPtr, PtrOff);
+      // Create a frame index.
+      int32_t Offset = VA.getLocMemOffset()+FPDiff;
+      uint32_t OpSize = (MVT::getSizeInBits(VA.getLocVT())+7)/8;
+      FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset);
+      FIN = DAG.getFrameIndex(FI, MVT::i32);
+      if (Flags & ISD::ParamFlags::ByVal) {
+        // Copy relative to framepointer.
+        unsigned Align = 1 << ((Flags & ISD::ParamFlags::ByValAlign) >>
+                               ISD::ParamFlags::ByValAlignOffs);
+
+        unsigned  Size = (Flags & ISD::ParamFlags::ByValSize) >>
+          ISD::ParamFlags::ByValSizeOffs;
+
+        SDOperand AlignNode = DAG.getConstant(Align, MVT::i32);
+        SDOperand  SizeNode = DAG.getConstant(Size, MVT::i32);
+        // Copy relative to framepointer.
+        MemOpChains2.push_back(DAG.getNode(ISD::MEMCPY, MVT::Other, Chain, FIN,
+                                           PtrOff, SizeNode, AlignNode));
+      } else {
+        SDOperand LoadedArg = DAG.getLoad(VA.getValVT(), Chain, PtrOff, NULL,0);
+        // Store relative to framepointer.
+        MemOpChains2.push_back(DAG.getStore(Chain, LoadedArg, FIN, NULL, 0));
+      }
+    }
+  }
+
+  if (!MemOpChains2.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, MVT::Other,
+                        &MemOpChains2[0], MemOpChains2.size());
+
+  // ELF / PIC requires the GOT pointer in the EBX register before function
+  // calls via the PLT.
+  // This does not work with tail calls, since EBX is not restored correctly
+  // by the tail caller. TODO: at least for x86 - verify for x86-64.
+
+  // If the callee is a GlobalAddress node (quite common, every direct call is)
+  // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+    // We should use an extra load for direct calls to dllimported functions
+    // in non-JIT mode.
+    if (!Subtarget->GVRequiresExtraLoad(G->getGlobal(),
+                                        getTargetMachine(), true))
+      Callee = DAG.getTargetGlobalAddress(G->getGlobal(), getPointerTy());
+  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee))
+    Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy());
+  else {
+    assert(Callee.getOpcode() == ISD::LOAD &&
+           "Function destination must be loaded into virtual register");
+    unsigned Opc = is64Bit ? X86::R9 : X86::ECX;
+
+    Chain = DAG.getCopyToReg(Chain,
+                             DAG.getRegister(Opc, getPointerTy()),
+                             Callee,InFlag);
+    Callee = DAG.getRegister(Opc, getPointerTy());
+    // Add the register as live out.
+    DAG.getMachineFunction().addLiveOut(Opc);
+  }
+
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+  SmallVector<SDOperand, 8> Ops;
+
+  Ops.push_back(Chain);
+  Ops.push_back(DAG.getConstant(NumBytesToBePushed, getPointerTy()));
+  Ops.push_back(DAG.getConstant(0, getPointerTy()));
+  if (InFlag.Val)
+    Ops.push_back(InFlag);
+  Chain = DAG.getNode(ISD::CALLSEQ_END, NodeTys, &Ops[0], Ops.size());
+  InFlag = Chain.getValue(1);
+
+  // Returns a chain & a flag for retval copy to use.
+  NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+  Ops.clear();
+  Ops.push_back(Chain);
+  Ops.push_back(Callee);
+  Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
+  // Add argument registers to the end of the list so that they are known live
+  // into the call.
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+                                  RegsToPass[i].second.getValueType()));
+  if (InFlag.Val)
+    Ops.push_back(InFlag);
+  assert(InFlag.Val &&
+         "Flag must be set. Depends on the flag being set in LowerRET");
+  Chain = DAG.getNode(X86ISD::TAILCALL,
+                      Op.Val->getVTList(), &Ops[0], Ops.size());
+
+  return SDOperand(Chain.Val, Op.ResNo);
+}
+
 //===----------------------------------------------------------------------===//
 //                 X86-64 C Calling Convention implementation
@@ -1323,6 +1699,7 @@ X86TargetLowering::LowerX86_64CCCArguments(SDOperand Op, SelectionDAG &DAG) {
   MachineFrameInfo *MFI = MF.getFrameInfo();
   SDOperand Root = Op.getOperand(0);
   bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
+  unsigned CC = MF.getFunction()->getCallingConv();
 
   static const unsigned GPR64ArgRegs[] = {
     X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8,  X86::R9
@@ -1335,9 +1712,12 @@ X86TargetLowering::LowerX86_64CCCArguments(SDOperand Op, SelectionDAG &DAG) {
 
   // Assign locations to all of the incoming arguments.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(MF.getFunction()->getCallingConv(), isVarArg,
+  CCState CCInfo(CC, isVarArg,
                  getTargetMachine(), ArgLocs);
-  CCInfo.AnalyzeFormalArguments(Op.Val, CC_X86_64_C);
+  if (CC == CallingConv::Fast && PerformTailCallOpt)
+    CCInfo.AnalyzeFormalArguments(Op.Val, CC_X86_64_TailCall);
+  else
+    CCInfo.AnalyzeFormalArguments(Op.Val, CC_X86_64_C);
 
   SmallVector<SDOperand, 8> ArgValues;
   unsigned LastVal = ~0U;
@@ -1398,10 +1778,14 @@ X86TargetLowering::LowerX86_64CCCArguments(SDOperand Op, SelectionDAG &DAG) {
   }
 
   unsigned StackSize = CCInfo.getNextStackOffset();
+  if (CC==CallingConv::Fast)
+    StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
 
   // If the function takes variable number of arguments, make a frame index for
   // the start of the first vararg value... for expansion of llvm.va_start.
   if (isVarArg) {
+    assert(CC!=CallingConv::Fast
+           && "Var arg not supported with calling convention fastcc");
     unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs, 6);
     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
@@ -1446,10 +1830,14 @@ X86TargetLowering::LowerX86_64CCCArguments(SDOperand Op, SelectionDAG &DAG) {
   }
 
   ArgValues.push_back(Root);
-
-  BytesToPopOnReturn = 0;  // Callee pops nothing.
-  BytesCallerReserves = StackSize;
-
+  // The tail call convention (fastcc) needs a callee pop.
+  if (CC == CallingConv::Fast && PerformTailCallOpt){
+    BytesToPopOnReturn = StackSize;  // Callee pops everything.
+    BytesCallerReserves = 0;
+  } else {
+    BytesToPopOnReturn = 0;  // Callee pops nothing.
+    BytesCallerReserves = StackSize;
+  }
 
   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
   FuncInfo->setBytesToPopOnReturn(BytesToPopOnReturn);
@@ -1463,16 +1851,21 @@ X86TargetLowering::LowerX86_64CCCCallTo(SDOperand Op, SelectionDAG &DAG,
                                         unsigned CC) {
   SDOperand Chain     = Op.getOperand(0);
   bool isVarArg       = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
-  bool isTailCall     = cast<ConstantSDNode>(Op.getOperand(3))->getValue() != 0;
   SDOperand Callee    = Op.getOperand(4);
 
   // Analyze operands of the call, assigning locations to each operand.
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
-  CCInfo.AnalyzeCallOperands(Op.Val, CC_X86_64_C);
+  if (CC==CallingConv::Fast)
+    CCInfo.AnalyzeCallOperands(Op.Val, CC_X86_64_TailCall);
+  else
+    CCInfo.AnalyzeCallOperands(Op.Val, CC_X86_64_C);
 
   // Get a count of how many bytes are to be pushed on the stack.
   unsigned NumBytes = CCInfo.getNextStackOffset();
+  if (CC == CallingConv::Fast)
+    NumBytes = GetAlignedArgumentStackSize(NumBytes,DAG);
+
   Chain = DAG.getCALLSEQ_START(Chain,DAG.getConstant(NumBytes, getPointerTy()));
 
   SmallVector<std::pair<unsigned, SDOperand>, 8> RegsToPass;
@@ -1526,6 +1919,9 @@ X86TargetLowering::LowerX86_64CCCCallTo(SDOperand Op, SelectionDAG &DAG,
   }
 
   if (isVarArg) {
+    assert(CallingConv::Fast != CC &&
+            "Var args not supported with calling convention fastcc");
+
     // From AMD64 ABI document:
     // For calls that may call functions that use varargs or stdargs
     // (prototype-less calls or calls to functions containing ellipsis (...) in
@@ -1574,17 +1970,22 @@ X86TargetLowering::LowerX86_64CCCCallTo(SDOperand Op, SelectionDAG &DAG,
   if (InFlag.Val)
     Ops.push_back(InFlag);
 
-  // FIXME: Do not generate X86ISD::TAILCALL for now.
-  Chain = DAG.getNode(isTailCall ? X86ISD::TAILCALL : X86ISD::CALL,
+  Chain = DAG.getNode(X86ISD::CALL,
                       NodeTys, &Ops[0], Ops.size());
   InFlag = Chain.getValue(1);
-
+  int NumBytesForCalleeToPush = 0;
+  if (CC==CallingConv::Fast) {
+    NumBytesForCalleeToPush = NumBytes;  // Callee pops everything.
+  } else {
+    NumBytesForCalleeToPush = 0;  // Callee pops nothing.
+  }
   // Returns a flag for retval copy to use.
   NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
   Ops.clear();
   Ops.push_back(Chain);
   Ops.push_back(DAG.getConstant(NumBytes, getPointerTy()));
-  Ops.push_back(DAG.getConstant(0, getPointerTy()));
+  Ops.push_back(DAG.getConstant(NumBytesForCalleeToPush, getPointerTy()));
   Ops.push_back(InFlag);
   Chain = DAG.getNode(ISD::CALLSEQ_END, NodeTys, &Ops[0], Ops.size());
   InFlag = Chain.getValue(1);
@@ -3106,10 +3507,14 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDOperand Op, SelectionDAG &DAG) {
     // SHUFPS the element to the lowest double word, then movss.
     MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(4);
     SmallVector<SDOperand, 8> IdxVec;
-    IdxVec.push_back(DAG.getConstant(Idx, MVT::getVectorElementType(MaskVT)));
-    IdxVec.push_back(DAG.getNode(ISD::UNDEF, MVT::getVectorElementType(MaskVT)));
-    IdxVec.push_back(DAG.getNode(ISD::UNDEF, MVT::getVectorElementType(MaskVT)));
-    IdxVec.push_back(DAG.getNode(ISD::UNDEF, MVT::getVectorElementType(MaskVT)));
+    IdxVec.
+      push_back(DAG.getConstant(Idx, MVT::getVectorElementType(MaskVT)));
+    IdxVec.
+      push_back(DAG.getNode(ISD::UNDEF, MVT::getVectorElementType(MaskVT)));
+    IdxVec.
+      push_back(DAG.getNode(ISD::UNDEF, MVT::getVectorElementType(MaskVT)));
+    IdxVec.
+      push_back(DAG.getNode(ISD::UNDEF, MVT::getVectorElementType(MaskVT)));
     SDOperand Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
                                  &IdxVec[0], IdxVec.size());
     Vec = DAG.getNode(ISD::VECTOR_SHUFFLE, Vec.getValueType(),
@@ -3128,7 +3533,8 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDOperand Op, SelectionDAG &DAG) {
     MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(4);
     SmallVector<SDOperand, 8> IdxVec;
     IdxVec.push_back(DAG.getConstant(1, MVT::getVectorElementType(MaskVT)));
-    IdxVec.push_back(DAG.getNode(ISD::UNDEF, MVT::getVectorElementType(MaskVT)));
+    IdxVec.
+      push_back(DAG.getNode(ISD::UNDEF, MVT::getVectorElementType(MaskVT)));
     SDOperand Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
                                  &IdxVec[0], IdxVec.size());
     Vec = DAG.getNode(ISD::VECTOR_SHUFFLE, Vec.getValueType(),
@@ -3777,17 +4183,23 @@ SDOperand X86TargetLowering::LowerBRCOND(SDOperand Op, SelectionDAG &DAG) {
 }
 
 SDOperand X86TargetLowering::LowerCALL(SDOperand Op, SelectionDAG &DAG) {
-  unsigned CallingConv= cast<ConstantSDNode>(Op.getOperand(1))->getValue();
-
-  if (Subtarget->is64Bit())
-    return LowerX86_64CCCCallTo(Op, DAG, CallingConv);
+  unsigned CallingConv = cast<ConstantSDNode>(Op.getOperand(1))->getValue();
+  bool isTailCall = cast<ConstantSDNode>(Op.getOperand(3))->getValue() != 0;
+
+  if (Subtarget->is64Bit())
+    if (CallingConv==CallingConv::Fast && isTailCall && PerformTailCallOpt)
+      return LowerX86_TailCallTo(Op, DAG, CallingConv);
+    else
+      return LowerX86_64CCCCallTo(Op, DAG, CallingConv);
   else
     switch (CallingConv) {
    default:
      assert(0 && "Unsupported calling convention");
    case CallingConv::Fast:
-      // TODO: Implement fastcc
-      // Falls through
+      if (isTailCall && PerformTailCallOpt)
+        return LowerX86_TailCallTo(Op, DAG, CallingConv);
+      else
+        return LowerCCCCallTo(Op,DAG, CallingConv);
    case CallingConv::C:
    case CallingConv::X86_StdCall:
      return LowerCCCCallTo(Op, DAG, CallingConv);
@@ -3855,8 +4267,7 @@ X86TargetLowering::LowerFORMAL_ARGUMENTS(SDOperand Op, SelectionDAG &DAG) {
    default:
      assert(0 && "Unsupported calling convention");
    case CallingConv::Fast:
-      // TODO: implement fastcc.
-
+      return LowerCCCArguments(Op,DAG, true);
      // Falls through
    case CallingConv::C:
      return LowerCCCArguments(Op, DAG);
@@ -4176,7 +4587,8 @@ X86TargetLowering::LowerREADCYCLCECOUNTER(SDOperand Op, SelectionDAG &DAG) {
   SDOperand TheOp = Op.getOperand(0);
   SDOperand rd = DAG.getNode(X86ISD::RDTSC_DAG, Tys, &TheOp, 1);
   if (Subtarget->is64Bit()) {
-    SDOperand Copy1 = DAG.getCopyFromReg(rd, X86::RAX, MVT::i64, rd.getValue(1));
+    SDOperand Copy1 =
+      DAG.getCopyFromReg(rd, X86::RAX, MVT::i64, rd.getValue(1));
     SDOperand Copy2 = DAG.getCopyFromReg(Copy1.getValue(1), X86::RDX,
                                          MVT::i64, Copy1.getValue(2));
     SDOperand Tmp = DAG.getNode(ISD::SHL, MVT::i64, Copy2,
@@ -4612,6 +5024,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::TLSADDR:            return "X86ISD::TLSADDR";
   case X86ISD::THREAD_POINTER:     return "X86ISD::THREAD_POINTER";
   case X86ISD::EH_RETURN:          return "X86ISD::EH_RETURN";
+  case X86ISD::TC_RETURN:          return "X86ISD::TC_RETURN";
   }
 }
@@ -4885,7 +5298,7 @@ static SDOperand getShuffleScalarElt(SDNode *N, unsigned i, SelectionDAG &DAG) {
   i %= NumElems;
   if (V.getOpcode() == ISD::SCALAR_TO_VECTOR) {
     return (i == 0)
-      ? V.getOperand(0) : DAG.getNode(ISD::UNDEF, MVT::getVectorElementType(VT));
+     ? V.getOperand(0) : DAG.getNode(ISD::UNDEF, MVT::getVectorElementType(VT));
   } else if (V.getOpcode() == ISD::VECTOR_SHUFFLE) {
     SDOperand Idx = PermMask.getOperand(i);
     if (Idx.getOpcode() == ISD::UNDEF) {
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index aa579d69f34..7123adaad27 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -181,7 +181,14 @@ namespace llvm {
       TLSADDR, THREAD_POINTER,
 
       // Exception Handling helpers
-      EH_RETURN
+      EH_RETURN,
+
+      // Tail call return
+      //   operand #0 chain
+      //   operand #1 callee (register or absolute)
+      //   operand #2 stack adjustment
+      //   operand #3 optional in flag
+      TC_RETURN
     };
   }
@@ -285,6 +292,7 @@ namespace llvm {
     unsigned VarArgsFPOffset;         // X86-64 vararg func fp reg offset.
     int BytesToPopOnReturn;           // Number of arg bytes ret should pop.
     int BytesCallerReserves;          // Number of arg bytes caller makes.
+
   public:
     explicit X86TargetLowering(TargetMachine &TM);
@@ -364,6 +372,14 @@ namespace llvm {
     virtual bool isVectorClearMaskLegal(std::vector<SDOperand> &BVOps,
                                         MVT::ValueType EVT,
                                         SelectionDAG &DAG) const;
+
+    /// IsEligibleForTailCallOptimization - Check whether the call is eligible
+    /// for tail call optimization. Targets that want to do tail call
+    /// optimization should implement this function.
+    virtual bool IsEligibleForTailCallOptimization(SDOperand Call,
+                                                   SDOperand Ret,
+                                                   SelectionDAG &DAG) const;
+
   private:
     /// Subtarget - Keep a pointer to the X86Subtarget around so that we can
     /// make the right decision when generating code for different targets.
@@ -372,7 +388,7 @@ namespace llvm {
 
     /// X86StackPtr - X86 physical register used as stack ptr.
     unsigned X86StackPtr;
-
+
     /// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87
     /// floating point ops.
     /// When SSE is available, use it for f32 operations.
@@ -402,6 +418,10 @@ namespace llvm {
     SDOperand LowerX86_64CCCArguments(SDOperand Op, SelectionDAG &DAG);
     SDOperand LowerX86_64CCCCallTo(SDOperand Op, SelectionDAG &DAG,unsigned CC);
 
+    // Fast calling convention (tail call) implementation for 32/64 bit.
+    SDOperand LowerX86_TailCallTo(SDOperand Op,
+                                      SelectionDAG & DAG, unsigned CC);
+    unsigned GetAlignedArgumentStackSize(unsigned StackSize, SelectionDAG &DAG);
     // Fast and FastCall Calling Convention implementation.
     SDOperand LowerFastCCArguments(SDOperand Op, SelectionDAG &DAG);
     SDOperand LowerFastCCCallTo(SDOperand Op, SelectionDAG &DAG, unsigned CC);
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 986aa0bc806..9d5e6371199 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -706,6 +706,8 @@ bool X86InstrInfo::BlockHasNoFallThrough(MachineBasicBlock &MBB) const {
   if (MBB.empty()) return false;
 
   switch (MBB.back().getOpcode()) {
+  case X86::TCRETURNri:
+  case X86::TCRETURNdi:
   case X86::RET:     // Return.
   case X86::RETI:
   case X86::TAILJMPd:
diff --git a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td
index 641eb2f7cc6..a6bd3fbbbde 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.td
+++ b/llvm/lib/Target/X86/X86InstrInfo.td
@@ -55,6 +55,8 @@ def SDT_X86TLSTP : SDTypeProfile<1, 0, [SDTCisPtrTy<0>]>;
 
 def SDT_X86EHRET : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
 
+def SDT_X86TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisVT<1, i32>]>;
+
 def X86shld    : SDNode<"X86ISD::SHLD",     SDTIntShiftDOp>;
 def X86shrd    : SDNode<"X86ISD::SHRD",     SDTIntShiftDOp>;
@@ -73,7 +75,7 @@ def X86callseq_start :
                        [SDNPHasChain, SDNPOutFlag]>;
 def X86callseq_end :
                 SDNode<"ISD::CALLSEQ_END",   SDT_X86CallSeqEnd,
-                        [SDNPHasChain, SDNPInFlag, SDNPOutFlag]>;
+                        [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
 
 def X86call    : SDNode<"X86ISD::CALL",     SDT_X86Call,
                         [SDNPHasChain, SDNPOutFlag, SDNPOptInFlag]>;
@@ -99,6 +101,8 @@ def X86TLStp : SDNode<"X86ISD::THREAD_POINTER", SDT_X86TLSTP, []>;
 def X86ehret : SDNode<"X86ISD::EH_RETURN", SDT_X86EHRET,
                         [SDNPHasChain]>;
 
+def X86tcret : SDNode<"X86ISD::TC_RETURN", SDT_X86TCRET,
+                        [SDNPHasChain,  SDNPOptInFlag]>;
 
 //===----------------------------------------------------------------------===//
 // X86 Operand Definitions.
@@ -356,15 +360,30 @@ let isCall = 1 in
   }
 
 // Tail call stuff.
+ +def TAILCALL : I<0, Pseudo, (outs), (ins ), +                         "#TAILCALL", +                         []>; + +let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in +def TCRETURNdi : I<0, Pseudo, (outs), (ins i32imm:$dst, i32imm:$offset), +                 "#TC_RETURN $dst $offset", +                 []>; +  let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in -  def TAILJMPd : IBr<0xE9, (ins i32imm:$dst), "jmp\t${dst:call}  # TAIL CALL", +def TCRETURNri : I<0, Pseudo, (outs), (ins GR32:$dst, i32imm:$offset), +                 "#TC_RETURN $dst $offset",                   []>; +  let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in -  def TAILJMPr : I<0xFF, MRM4r, (outs), (ins GR32:$dst), "jmp\t{*}$dst  # TAIL CALL", +  def TAILJMPd : IBr<0xE9, (ins i32imm:$dst), "jmp\t${dst:call}  # TAILCALL",                   []>;  let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in +  def TAILJMPr : I<0xFF, MRM4r, (outs), (ins GR32:$dst), "jmp{l}\t{*}$dst  # TAILCALL", +                 []>;      +let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in    def TAILJMPm : I<0xFF, MRM4m, (outs), (ins i32mem:$dst), -                   "jmp\t{*}$dst  # TAIL CALL", []>; +                   "jmp\t{*}$dst  # TAILCALL", []>;  //===----------------------------------------------------------------------===//  //  Miscellaneous Instructions... @@ -2507,13 +2526,23 @@ def : Pat<(store (i32 (X86Wrapper texternalsym:$src)), addr:$dst),            (MOV32mi addr:$dst, texternalsym:$src)>;  // Calls +// tailcall stuff  def : Pat<(X86tailcall GR32:$dst), -          (CALL32r     GR32:$dst)>; +          (TAILCALL)>;  def : Pat<(X86tailcall (i32 tglobaladdr:$dst)), -          (CALLpcrel32 tglobaladdr:$dst)>; +          (TAILCALL)>;  def : Pat<(X86tailcall (i32 texternalsym:$dst)), -          (CALLpcrel32 texternalsym:$dst)>; +          (TAILCALL)>; + +def : Pat<(X86tcret GR32:$dst, imm:$off), +          (TCRETURNri GR32:$dst, imm:$off)>; + +def : Pat<(X86tcret (i32 tglobaladdr:$dst), imm:$off), +          (TCRETURNdi texternalsym:$dst, imm:$off)>; + +def : Pat<(X86tcret (i32 texternalsym:$dst), imm:$off), +          (TCRETURNdi texternalsym:$dst, imm:$off)>;  def : Pat<(X86call (i32 tglobaladdr:$dst)),            (CALLpcrel32 tglobaladdr:$dst)>; diff --git a/llvm/lib/Target/X86/X86InstrX86-64.td b/llvm/lib/Target/X86/X86InstrX86-64.td index f6f48a21d47..f501b5ec558 100644 --- a/llvm/lib/Target/X86/X86InstrX86-64.td +++ b/llvm/lib/Target/X86/X86InstrX86-64.td @@ -102,6 +102,23 @@ let isCall = 1 in                            "call\t{*}$dst", []>;    } + + +let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in +def TCRETURNdi64 : I<0, Pseudo, (outs), (ins i64imm:$dst, i32imm:$offset), +                 "#TC_RETURN $dst $offset", +                 []>; + +let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in +def TCRETURNri64 : I<0, Pseudo, (outs), (ins GR64:$dst, i32imm:$offset), +                 "#TC_RETURN $dst $offset", +                 []>; + + +let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in +  def TAILJMPr64 : I<0xFF, MRM4r, (outs), (ins GR64:$dst), "jmp{q}\t{*}$dst  # TAILCALL", +                 []>;      +  // Branches  let isBranch = 1, isTerminator = 1, isBarrier = 1 in {    def JMP64r     : I<0xFF, MRM4r, (outs), (ins GR64:$dst), "jmp{q}\t{*}$dst", @@ -1105,6 +1122,24 @@ def : Pat<(X86tailcall (i64 texternalsym:$dst)),  def : Pat<(X86tailcall GR64:$dst),            (CALL64r GR64:$dst)>; + +// tailcall stuff +def 
: Pat<(X86tailcall GR32:$dst), +          (TAILCALL)>; +def : Pat<(X86tailcall (i64 tglobaladdr:$dst)), +          (TAILCALL)>; +def : Pat<(X86tailcall (i64 texternalsym:$dst)), +          (TAILCALL)>; + +def : Pat<(X86tcret GR64:$dst, imm:$off), +          (TCRETURNri64 GR64:$dst, imm:$off)>; + +def : Pat<(X86tcret (i64 tglobaladdr:$dst), imm:$off), +          (TCRETURNdi64 texternalsym:$dst, imm:$off)>; + +def : Pat<(X86tcret (i64 texternalsym:$dst), imm:$off), +          (TCRETURNdi64 texternalsym:$dst, imm:$off)>; +  // Comparisons.  // TEST R,R is smaller than CMP R,0 diff --git a/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/llvm/lib/Target/X86/X86MachineFunctionInfo.h index e50a104f216..05972c66c27 100644 --- a/llvm/lib/Target/X86/X86MachineFunctionInfo.h +++ b/llvm/lib/Target/X86/X86MachineFunctionInfo.h @@ -47,18 +47,26 @@ class X86MachineFunctionInfo : public MachineFunctionInfo {    // FrameIndex for return slot.    int ReturnAddrIndex; + +  // Delta the ReturnAddr stack slot is moved +  // Used for creating an area before the register spill area on the stack +  // the returnaddr can be savely move to this area +  int TailCallReturnAddrDelta; +  public:    X86MachineFunctionInfo() : ForceFramePointer(false),                               CalleeSavedFrameSize(0),                               BytesToPopOnReturn(0),                               DecorationStyle(None), -                             ReturnAddrIndex(0) {} +                             ReturnAddrIndex(0), +                             TailCallReturnAddrDelta(0){}    X86MachineFunctionInfo(MachineFunction &MF) : ForceFramePointer(false),                                                  CalleeSavedFrameSize(0),                                                  BytesToPopOnReturn(0),                                                  DecorationStyle(None), -                                                ReturnAddrIndex(0) {} +                                                ReturnAddrIndex(0), +                                                TailCallReturnAddrDelta(0) {}    bool getForceFramePointer() const { return ForceFramePointer;}     void setForceFramePointer(bool forceFP) { ForceFramePointer = forceFP; } @@ -74,6 +82,9 @@ public:    int getRAIndex() const { return ReturnAddrIndex; }    void setRAIndex(int Index) { ReturnAddrIndex = Index; } + +  int getTCReturnAddrDelta() const { return TailCallReturnAddrDelta; } +  void setTCReturnAddrDelta(int delta) {TailCallReturnAddrDelta = delta;}  };  } // End llvm namespace diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp index 83cb03c76f8..f017d4020ae 100644 --- a/llvm/lib/Target/X86/X86RegisterInfo.cpp +++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -1436,18 +1436,42 @@ void X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,    if (!hasFP(MF))      Offset += MF.getFrameInfo()->getStackSize(); -  else +  else {      Offset += SlotSize;  // Skip the saved EBP - +    // Skip the RETADDR move area +    X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); +    int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta(); +    if (TailCallReturnAddrDelta < 0) Offset -= TailCallReturnAddrDelta; +  } +      MI.getOperand(i+3).ChangeToImmediate(Offset);  }  void  X86RegisterInfo::processFunctionBeforeFrameFinalized(MachineFunction &MF) const{ +  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); +  int32_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta(); +  if 
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp
index 83cb03c76f8..f017d4020ae 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -1436,18 +1436,42 @@ void X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   if (!hasFP(MF))
     Offset += MF.getFrameInfo()->getStackSize();
-  else
+  else {
     Offset += SlotSize;  // Skip the saved EBP
-
+    // Skip the RETADDR move area.
+    X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+    int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
+    if (TailCallReturnAddrDelta < 0) Offset -= TailCallReturnAddrDelta;
+  }
+
   MI.getOperand(i+3).ChangeToImmediate(Offset);
 }
 
 void
 X86RegisterInfo::processFunctionBeforeFrameFinalized(MachineFunction &MF) const{
+  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+  int32_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
+  if (TailCallReturnAddrDelta < 0) {
+    // Create the RETURNADDR area:
+    //   arg
+    //   arg
+    //   RETADDR
+    //   { ...
+    //     RETADDR area
+    //     ...
+    //   }
+    //   [EBP]
+    MF.getFrameInfo()->
+      CreateFixedObject(-TailCallReturnAddrDelta,
+                        (-1*SlotSize)+TailCallReturnAddrDelta);
+  }
   if (hasFP(MF)) {
+    assert((TailCallReturnAddrDelta <= 0) &&
+           "The delta should always be zero or negative");
     // Create a frame entry for the EBP register that must be saved.
     int FrameIdx = MF.getFrameInfo()->CreateFixedObject(SlotSize,
-                                                        (int)SlotSize * -2);
+                                                        (int)SlotSize * -2 +
+                                                        TailCallReturnAddrDelta);
     assert(FrameIdx == MF.getFrameInfo()->getObjectIndexBegin() &&
            "Slot for EBP register must be last in order to be found!");
   }
@@ -1530,6 +1554,41 @@ void mergeSPUpdatesDown(MachineBasicBlock &MBB,
   }
 }
 
+/// mergeSPUpdates - Checks the instruction before/after the passed
+/// instruction. If it is an ADD/SUB of the stack pointer it is deleted and
+/// the stack adjustment is returned as a positive value for ADD and a
+/// negative one for SUB.
+static int mergeSPUpdates(MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator &MBBI,
+                          unsigned StackPtr,
+                          bool doMergeWithPrevious) {
+  if ((doMergeWithPrevious && MBBI == MBB.begin()) ||
+      (!doMergeWithPrevious && MBBI == MBB.end()))
+    return 0;
+
+  int Offset = 0;
+
+  MachineBasicBlock::iterator PI = doMergeWithPrevious ? prior(MBBI) : MBBI;
+  MachineBasicBlock::iterator NI = doMergeWithPrevious ? 0 : next(MBBI);
+  unsigned Opc = PI->getOpcode();
+  if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 ||
+       Opc == X86::ADD32ri || Opc == X86::ADD32ri8) &&
+      PI->getOperand(0).getReg() == StackPtr) {
+    Offset += PI->getOperand(2).getImm();
+    MBB.erase(PI);
+    if (!doMergeWithPrevious) MBBI = NI;
+  } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 ||
+              Opc == X86::SUB32ri || Opc == X86::SUB32ri8) &&
+             PI->getOperand(0).getReg() == StackPtr) {
+    Offset -= PI->getOperand(2).getImm();
+    MBB.erase(PI);
+    if (!doMergeWithPrevious) MBBI = NI;
+  }
+
+  return Offset;
+}
+
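[Editor's note: the prologue and epilogue code below uses mergeSPUpdates to fold an adjacent stack-pointer adjustment into the one it is about to emit, so that, for example, the delta SUB inserted for the return-address move and the ordinary frame-allocation SUB become a single instruction. A hedged IR example that should exercise the merge; the adjustment sizes are illustrative, not guaranteed:

; Illustrative IR only.  @f both allocates a local and makes a tail call
; needing extra argument stack, so its prologue would otherwise contain two
; back-to-back ESP subtractions; mergeSPUpdates folds them into one.
define fastcc i32 @callee6(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f) {
entry:
	ret i32 %a
}

define fastcc i32 @f(i32 %x, i32 %y) {
entry:
	%local = alloca i32		; <i32*> [#uses=1]
	store i32 %x, i32* %local
	%tmp = tail call fastcc i32 @callee6( i32 %x, i32 %y, i32 %x, i32 %y, i32 %x, i32 %y )		; <i32> [#uses=1]
	ret i32 %tmp
}
]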
 void X86RegisterInfo::emitPrologue(MachineFunction &MF) const {
   MachineBasicBlock &MBB = MF.front();   // Prolog goes in entry BB
   MachineFrameInfo *MFI = MF.getFrameInfo();
@@ -1543,10 +1602,23 @@ void X86RegisterInfo::emitPrologue(MachineFunction &MF) const {
   // Prepare for frame info.
   unsigned FrameLabelId = 0;
 
-  // Get the number of bytes to allocate from the FrameInfo
+  // Get the number of bytes to allocate from the FrameInfo.
   uint64_t StackSize = MFI->getStackSize();
+  // Add the RETADDR move area to the callee-saved frame size.
+  int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
+  if (TailCallReturnAddrDelta < 0)
+    X86FI->setCalleeSavedFrameSize(
+          X86FI->getCalleeSavedFrameSize() + (-TailCallReturnAddrDelta));
   uint64_t NumBytes = StackSize - X86FI->getCalleeSavedFrameSize();
+
+  // Insert a stack pointer adjustment for the later moving of the return
+  // address.  Only applies to tail call optimized functions where the
+  // callee's argument stack size is bigger than the caller's.
+  if (TailCallReturnAddrDelta < 0) {
+    BuildMI(MBB, MBBI, TII.get(Is64Bit ? X86::SUB64ri32 : X86::SUB32ri),
+            StackPtr).addReg(StackPtr).addImm(-TailCallReturnAddrDelta);
+  }
+
   if (hasFP(MF)) {
     // Get the offset of the stack slot for the EBP register... which is
     // guaranteed to be the last slot by processFunctionBeforeFrameFinalized.
@@ -1615,6 +1687,10 @@ void X86RegisterInfo::emitPrologue(MachineFunction &MF) const {
         MBB.insert(MBBI, MI);
       }
     } else {
+      // If there is a SUB32ri of ESP immediately before this instruction,
+      // merge the two.  This can be the case when tail call elimination is
+      // enabled and the callee has more arguments than the caller.
+      NumBytes -= mergeSPUpdates(MBB, MBBI, StackPtr, true);
       // If there is an ADD32ri or SUB32ri of ESP immediately after this
       // instruction, merge the two instructions.
       mergeSPUpdatesDown(MBB, MBBI, StackPtr, &NumBytes);
@@ -1711,6 +1787,10 @@ void X86RegisterInfo::emitEpilogue(MachineFunction &MF,
   switch (RetOpcode) {
   case X86::RET:
   case X86::RETI:
+  case X86::TCRETURNdi:
+  case X86::TCRETURNri:
+  case X86::TCRETURNri64:
+  case X86::TCRETURNdi64:
   case X86::EH_RETURN:
   case X86::TAILJMPd:
   case X86::TAILJMPr:
@@ -1773,7 +1853,46 @@ void X86RegisterInfo::emitEpilogue(MachineFunction &MF,
     MachineOperand &DestAddr  = MBBI->getOperand(0);
     assert(DestAddr.isRegister() && "Offset should be in register!");
     BuildMI(MBB, MBBI, TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr),StackPtr).
       addReg(DestAddr.getReg());
+  // Tail call return: adjust the stack pointer and jump to the callee.
+  } else if (RetOpcode == X86::TCRETURNri || RetOpcode == X86::TCRETURNdi ||
+             RetOpcode == X86::TCRETURNri64 || RetOpcode == X86::TCRETURNdi64) {
+    MBBI = prior(MBB.end());
+    MachineOperand &JumpTarget = MBBI->getOperand(0);
+    MachineOperand &StackAdjust = MBBI->getOperand(1);
+    assert(StackAdjust.isImmediate() && "Expecting immediate value.");
+
+    // Adjust the stack pointer.
+    int StackAdj = StackAdjust.getImm();
+    int MaxTCDelta = X86FI->getTCReturnAddrDelta();
+    int Offset = 0;
+    assert(MaxTCDelta <= 0 && "MaxTCDelta should never be positive");
+    // Incorporate the retaddr area.
+    Offset = StackAdj - MaxTCDelta;
+    assert(Offset >= 0 && "Offset should never be negative");
+    if (Offset) {
+      // Check for a possible merge with a preceding ADD instruction.
+      Offset += mergeSPUpdates(MBB, MBBI, StackPtr, true);
+      emitSPUpdate(MBB, MBBI, StackPtr, Offset, Is64Bit, TII);
+    }
+    // Jump to label or value in register.
+    if (RetOpcode == X86::TCRETURNdi || RetOpcode == X86::TCRETURNdi64)
+      BuildMI(MBB, MBBI, TII.get(X86::TAILJMPd)).
+        addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset());
+    else if (RetOpcode == X86::TCRETURNri64)
+      BuildMI(MBB, MBBI, TII.get(X86::TAILJMPr64), JumpTarget.getReg());
+    else
+      BuildMI(MBB, MBBI, TII.get(X86::TAILJMPr), JumpTarget.getReg());
+    // Delete the pseudo instruction TCRETURN.
+    MBB.erase(MBBI);
+  } else if ((RetOpcode == X86::RET || RetOpcode == X86::RETI) &&
+             (X86FI->getTCReturnAddrDelta() < 0)) {
+    // Add the return addr area delta back since we are not tail calling.
+    int delta = -1*X86FI->getTCReturnAddrDelta();
+    MBBI = prior(MBB.end());
+    // Check for a possible merge with a preceding ADD instruction.
+    delta += mergeSPUpdates(MBB, MBBI, StackPtr, true);
+    emitSPUpdate(MBB, MBBI, StackPtr, delta, Is64Bit, TII);
+  }
 }
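[Editor's note: taken together, a TCRETURN pseudo therefore becomes at most one stack-pointer fixup plus an unconditional jump. A hedged illustration of the expected shape of the output; the actual registers and byte counts depend on the calling-convention lowering:

; Illustrative IR and expected output shape only.
define fastcc i32 @target(i32 %a) {
entry:
	ret i32 %a
}

define fastcc i32 @jumper(i32 %x) {
entry:
	%tmp = tail call fastcc i32 @target( i32 %x )		; <i32> [#uses=1]
	ret i32 %tmp
}
; Roughly expected epilogue for @jumper after TCRETURNdi expansion:
;	jmp target  # TAILCALL
; with any nonzero stack adjustment emitted as an add/sub of ESP just
; before the jmp.
]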
diff --git a/llvm/test/CodeGen/X86/tailcall1.ll b/llvm/test/CodeGen/X86/tailcall1.ll
new file mode 100644
index 00000000000..74687f584ab
--- /dev/null
+++ b/llvm/test/CodeGen/X86/tailcall1.ll
@@ -0,0 +1,11 @@
+; RUN: llvm-as < %s | llc -tailcallopt | grep TAILCALL
+define fastcc i32 @tailcallee(i32 %a1, i32 %a2, i32 %a3, i32 %a4) {
+entry:
+	ret i32 %a3
+}
+
+define fastcc i32 @tailcaller(i32 %in1, i32 %in2) {
+entry:
+	%tmp11 = tail call fastcc i32 @tailcallee( i32 %in1, i32 %in2, i32 %in1, i32 %in2 )		; <i32> [#uses=1]
+	ret i32 %tmp11
+}
diff --git a/llvm/test/CodeGen/X86/tailcallpic1.ll b/llvm/test/CodeGen/X86/tailcallpic1.ll
new file mode 100644
index 00000000000..54074eb0ba2
--- /dev/null
+++ b/llvm/test/CodeGen/X86/tailcallpic1.ll
@@ -0,0 +1,12 @@
+; RUN: llvm-as < %s | llc -tailcallopt -mtriple=i686-pc-linux-gnu -relocation-model=pic | grep TAILCALL
+
+define protected fastcc i32 @tailcallee(i32 %a1, i32 %a2, i32 %a3, i32 %a4) {
+entry:
+	ret i32 %a3
+}
+
+define fastcc i32 @tailcaller(i32 %in1, i32 %in2) {
+entry:
+	%tmp11 = tail call fastcc i32 @tailcallee( i32 %in1, i32 %in2, i32 %in1, i32 %in2 )		; <i32> [#uses=1]
+	ret i32 %tmp11
+}
diff --git a/llvm/test/CodeGen/X86/tailcallpic2.ll b/llvm/test/CodeGen/X86/tailcallpic2.ll
new file mode 100644
index 00000000000..60818e4f62c
--- /dev/null
+++ b/llvm/test/CodeGen/X86/tailcallpic2.ll
@@ -0,0 +1,12 @@
+; RUN: llvm-as < %s | llc -tailcallopt -mtriple=i686-pc-linux-gnu -relocation-model=pic | grep -v TAILCALL
+
+define fastcc i32 @tailcallee(i32 %a1, i32 %a2, i32 %a3, i32 %a4) {
+entry:
+	ret i32 %a3
+}
+
+define fastcc i32 @tailcaller(i32 %in1, i32 %in2) {
+entry:
+	%tmp11 = tail call fastcc i32 @tailcallee( i32 %in1, i32 %in2, i32 %in1, i32 %in2 )		; <i32> [#uses=1]
+	ret i32 %tmp11
+}
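[Editor's note: the tests exercise only fastcc-to-fastcc calls, which appears to be the shape this patch optimizes; a call marked "tail" whose caller and callee conventions differ should be emitted as an ordinary call, so the "# TAILCALL" marker never appears in the output, as in tailcallpic2.ll's grep -v check. A hedged sketch of such a non-candidate:

; Illustrative IR only: the callee is not fastcc, so even though the call
; is marked "tail" it is expected to lower as a normal call (no
; "# TAILCALL" comment in the generated assembly).
define i32 @ccallee(i32 %a1) {
entry:
	ret i32 %a1
}

define fastcc i32 @mixedcaller(i32 %in1) {
entry:
	%tmp = tail call i32 @ccallee( i32 %in1 )		; <i32> [#uses=1]
	ret i32 %tmp
}
]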

