summaryrefslogtreecommitdiffstats
path: root/llvm/lib/Target/X86
diff options
context:
space:
mode:
authorArnold Schwaighofer <arnold.schwaighofer@gmail.com>2007-10-11 19:40:01 +0000
committerArnold Schwaighofer <arnold.schwaighofer@gmail.com>2007-10-11 19:40:01 +0000
commit9ccea99165d2e5cb4fba57b0d02e56651e2ae28c (patch)
treeb0726d27dcf2a842adb38aef6a374c5e2aee9b9e /llvm/lib/Target/X86
parent29cfef59ff083be23222631e4ebbe7f695ef8386 (diff)
downloadbcm5719-llvm-9ccea99165d2e5cb4fba57b0d02e56651e2ae28c.tar.gz
bcm5719-llvm-9ccea99165d2e5cb4fba57b0d02e56651e2ae28c.zip
Added tail call optimization to the x86 back end. It can be
enabled by passing -tailcallopt to llc. The optimization is performed if the following conditions are satisfied: * caller/callee are fastcc * elf/pic is disabled OR elf/pic enabled + callee is in module + callee has visibility protected or hidden llvm-svn: 42870
Diffstat (limited to 'llvm/lib/Target/X86')
-rw-r--r--llvm/lib/Target/X86/README.txt80
-rw-r--r--llvm/lib/Target/X86/X86CallingConv.td50
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.cpp511
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.h24
-rw-r--r--llvm/lib/Target/X86/X86InstrInfo.cpp2
-rw-r--r--llvm/lib/Target/X86/X86InstrInfo.td43
-rw-r--r--llvm/lib/Target/X86/X86InstrX86-64.td35
-rw-r--r--llvm/lib/Target/X86/X86MachineFunctionInfo.h15
-rw-r--r--llvm/lib/Target/X86/X86RegisterInfo.cpp129
9 files changed, 824 insertions, 65 deletions
diff --git a/llvm/lib/Target/X86/README.txt b/llvm/lib/Target/X86/README.txt
index 9bafff73d5c..0d4dce32d83 100644
--- a/llvm/lib/Target/X86/README.txt
+++ b/llvm/lib/Target/X86/README.txt
@@ -1368,3 +1368,83 @@ L7:
L5:
//===---------------------------------------------------------------------===//
+Tail call optimization improvements: Tail call optimization currently
+pushes all arguments on the top of the stack (their normal place if
+that was a not tail call optimized functiong call ) before moving them
+to actual stack slot. this is done to prevent overwriting of paramters
+(see example below) that might be used, since the arguments of the
+callee overwrites callers arguments.
+
+ example:
+
+int callee(int32, int64);
+int caller(int32 arg1, int32 arg2) {
+ int64 local = arg2 * 2;
+ return callee(arg2, (int64)local);
+}
+
+[arg1] [!arg2 no longer valid since we moved local onto it]
+[arg2] -> [(int64)
+[RETADDR] local ]
+
+moving arg1 onto the stack slot of callee function would overwrite
+arg2 of the caller.
+
+Possible optimizations:
+
+ - only push those arguments to the top of the stack that are actual
+ parameters of the caller function and have no local value in the
+ caller
+
+ in above example local does not need to be pushed onto the top of
+ the stack as it is definitetly not a caller's function parameter
+
+ - analyse the actual parameters of the callee to see which would
+ overwrite a caller paramter which is used by the callee and only
+ push them onto the top of the stack
+
+ int callee (int32 arg1, int32 arg2);
+ int caller (int32 arg1, int32 arg2) {
+ return callee(arg1,arg2);
+ }
+
+ here we don't need to write any variables to the top of the stack
+ since they don't overwrite each other
+
+ int callee (int32 arg1, int32 arg2);
+ int caller (int32 arg1, int32 arg2) {
+ return callee(arg2,arg1);
+ }
+
+ here we need to push the arguments because they overwrite each other
+
+
+ code for lowering directly onto callers arguments:
++ SmallVector<std::pair<unsigned, SDOperand>, 8> RegsToPass;
++ SmallVector<SDOperand, 8> MemOpChains;
++
++ SDOperand FramePtr;
++ SDOperand PtrOff;
++ SDOperand FIN;
++ int FI = 0;
++ // Walk the register/memloc assignments, inserting copies/loads.
++ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
++ CCValAssign &VA = ArgLocs[i];
++ SDOperand Arg = Op.getOperand(5+2*VA.getValNo());
++
++ ....
++
++ if (VA.isRegLoc()) {
++ RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
++ } else {
++ assert(VA.isMemLoc());
++ // create frame index
++ int32_t Offset = VA.getLocMemOffset()+FPDiff;
++ uint32_t OpSize = (MVT::getSizeInBits(VA.getLocVT())+7)/8;
++ FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset);
++ FIN = DAG.getFrameIndex(FI, MVT::i32);
++ // store relative to framepointer
++ MemOpChains.push_back(DAG.getStore(Chain, Arg, FIN, NULL, 0));
++ }
++ }
+//===---------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/X86/X86CallingConv.td b/llvm/lib/Target/X86/X86CallingConv.td
index 9c2d95a1991..f23e5806563 100644
--- a/llvm/lib/Target/X86/X86CallingConv.td
+++ b/llvm/lib/Target/X86/X86CallingConv.td
@@ -127,6 +127,40 @@ def CC_X86_64_C : CallingConv<[
CCIfType<[v8i8, v4i16, v2i32, v1i64], CCAssignToStack<8, 8>>
]>;
+// tail call convetion (fast) one register is reserved for target address
+// namely R9
+def CC_X86_64_TailCall : CallingConv<[
+ // Promote i8/i16 arguments to i32.
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ CCIfStruct<CCStructAssign<[RDI, RSI, RDX, RCX, R8]>>,
+
+ // The first 6 integer arguments are passed in integer registers.
+ CCIfType<[i32], CCAssignToReg<[EDI, ESI, EDX, ECX, R8D]>>,
+ CCIfType<[i64], CCAssignToReg<[RDI, RSI, RDX, RCX, R8]>>,
+
+ // The first 8 FP/Vector arguments are passed in XMM registers.
+ CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>>,
+
+ // The first 8 MMX vector arguments are passed in GPRs.
+ CCIfType<[v8i8, v4i16, v2i32, v1i64],
+ CCAssignToReg<[RDI, RSI, RDX, RCX, R8]>>,
+
+ // The 'nest' parameter, if any, is passed in R10.
+ CCIfNest<CCAssignToReg<[R10]>>,
+
+ // Integer/FP values get stored in stack slots that are 8 bytes in size and
+ // 8-byte aligned if there are no more registers to hold them.
+ CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>,
+
+ // Vectors get 16-byte stack slots that are 16-byte aligned.
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>,
+
+ // __m64 vectors get 8-byte stack slots that are 8-byte aligned.
+ CCIfType<[v8i8, v4i16, v2i32, v1i64], CCAssignToStack<8, 8>>
+]>;
+
//===----------------------------------------------------------------------===//
// X86 C Calling Convention
@@ -173,6 +207,22 @@ def CC_X86_32_C : CallingConv<[
CCDelegateTo<CC_X86_32_Common>
]>;
+/// Same as C calling convention up to nonfree ECX which is used for storing
+/// potential pointer to tail called function
+def CC_X86_32_TailCall : CallingConv<[
+ // Promote i8/i16 arguments to i32.
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ // The 'nest' parameter, if any, is passed in ECX.
+ CCIfNest<CCAssignToReg<[ECX]>>,
+
+ // The first 3 integer arguments, if marked 'inreg' and if the call is not
+ // a vararg call, are passed in integer registers.
+ CCIfNotVarArg<CCIfInReg<CCIfType<[i32], CCAssignToReg<[EAX, EDX]>>>>,
+
+ // Otherwise, same as everything else.
+ CCDelegateTo<CC_X86_32_Common>
+]>;
def CC_X86_32_FastCall : CallingConv<[
// Promote i8/i16 arguments to i32.
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 1917a6a291f..8767d8d33b9 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -32,6 +32,8 @@
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SSARegMap.h"
#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ParameterAttributes.h"
@@ -43,6 +45,7 @@ X86TargetLowering::X86TargetLowering(TargetMachine &TM)
X86ScalarSSEf64 = Subtarget->hasSSE2();
X86ScalarSSEf32 = Subtarget->hasSSE1();
X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;
+
RegInfo = TM.getRegisterInfo();
@@ -641,6 +644,19 @@ X86TargetLowering::X86TargetLowering(TargetMachine &TM)
//===----------------------------------------------------------------------===//
#include "X86GenCallingConv.inc"
+
+/// GetPossiblePreceedingTailCall - Get preceeding X86ISD::TAILCALL node if it
+/// exists skip possible ISD:TokenFactor.
+static SDOperand GetPossiblePreceedingTailCall(SDOperand Chain) {
+ if (Chain.getOpcode()==X86ISD::TAILCALL) {
+ return Chain;
+ } else if (Chain.getOpcode()==ISD::TokenFactor) {
+ if (Chain.getNumOperands() &&
+ Chain.getOperand(0).getOpcode()==X86ISD::TAILCALL)
+ return Chain.getOperand(0);
+ }
+ return Chain;
+}
/// LowerRET - Lower an ISD::RET node.
SDOperand X86TargetLowering::LowerRET(SDOperand Op, SelectionDAG &DAG) {
@@ -651,8 +667,7 @@ SDOperand X86TargetLowering::LowerRET(SDOperand Op, SelectionDAG &DAG) {
bool isVarArg = DAG.getMachineFunction().getFunction()->isVarArg();
CCState CCInfo(CC, isVarArg, getTargetMachine(), RVLocs);
CCInfo.AnalyzeReturn(Op.Val, RetCC_X86);
-
-
+
// If this is the first return lowered for this function, add the regs to the
// liveout set for the function.
if (DAG.getMachineFunction().liveout_empty()) {
@@ -660,10 +675,38 @@ SDOperand X86TargetLowering::LowerRET(SDOperand Op, SelectionDAG &DAG) {
if (RVLocs[i].isRegLoc())
DAG.getMachineFunction().addLiveOut(RVLocs[i].getLocReg());
}
-
SDOperand Chain = Op.getOperand(0);
- SDOperand Flag;
+ // Handle tail call return.
+ Chain = GetPossiblePreceedingTailCall(Chain);
+ if (Chain.getOpcode() == X86ISD::TAILCALL) {
+ SDOperand TailCall = Chain;
+ SDOperand TargetAddress = TailCall.getOperand(1);
+ SDOperand StackAdjustment = TailCall.getOperand(2);
+ assert ( ((TargetAddress.getOpcode() == ISD::Register &&
+ (cast<RegisterSDNode>(TargetAddress)->getReg() == X86::ECX ||
+ cast<RegisterSDNode>(TargetAddress)->getReg() == X86::R9)) ||
+ TargetAddress.getOpcode() == ISD::TargetExternalSymbol ||
+ TargetAddress.getOpcode() == ISD::TargetGlobalAddress) &&
+ "Expecting an global address, external symbol, or register");
+ assert( StackAdjustment.getOpcode() == ISD::Constant &&
+ "Expecting a const value");
+
+ SmallVector<SDOperand,8> Operands;
+ Operands.push_back(Chain.getOperand(0));
+ Operands.push_back(TargetAddress);
+ Operands.push_back(StackAdjustment);
+ // Copy registers used by the call. Last operand is a flag so it is not
+ // copied.
+ for(unsigned i=3; i < TailCall.getNumOperands()-1;i++) {
+ Operands.push_back(Chain.getOperand(i));
+ }
+ return DAG.getNode(X86ISD::TC_RETURN, MVT::Other, &Operands[0], Operands.size());
+ }
+
+ // Regular return.
+ SDOperand Flag;
+
// Copy the result values into the output registers.
if (RVLocs.size() != 1 || !RVLocs[0].isRegLoc() ||
RVLocs[0].getLocReg() != X86::ST0) {
@@ -684,7 +727,7 @@ SDOperand X86TargetLowering::LowerRET(SDOperand Op, SelectionDAG &DAG) {
if ((X86ScalarSSEf32 && RVLocs[0].getValVT()==MVT::f32) ||
(X86ScalarSSEf64 && RVLocs[0].getValVT()==MVT::f64)) {
SDOperand MemLoc;
-
+
// If this is a load into a scalarsse value, don't store the loaded value
// back to the stack, only to reload it: just replace the scalar-sse load.
if (ISD::isNON_EXTLoad(Value.Val) &&
@@ -784,12 +827,14 @@ LowerCallResult(SDOperand Chain, SDOperand InFlag, SDNode *TheCall,
//===----------------------------------------------------------------------===//
-// C & StdCall Calling Convention implementation
+// C & StdCall & Fast Calling Convention implementation
//===----------------------------------------------------------------------===//
// StdCall calling convention seems to be standard for many Windows' API
// routines and around. It differs from C calling convention just a little:
// callee should clean up the stack, not caller. Symbols should be also
// decorated in some fancy way :) It doesn't support any vector arguments.
+// For info on fast calling convention see Fast Calling Convention (tail call)
+// implementation LowerX86_32FastCCCallTo.
/// AddLiveIn - This helper function adds the specified physical register to the
/// MachineFunction as a live in value. It also creates a corresponding virtual
@@ -802,6 +847,9 @@ static unsigned AddLiveIn(MachineFunction &MF, unsigned PReg,
return VReg;
}
+// align stack arguments according to platform alignment needed for tail calls
+unsigned GetAlignedArgumentStackSize(unsigned StackSize, SelectionDAG& DAG);
+
SDOperand X86TargetLowering::LowerMemArgument(SDOperand Op, SelectionDAG &DAG,
const CCValAssign &VA,
MachineFrameInfo *MFI,
@@ -826,13 +874,17 @@ SDOperand X86TargetLowering::LowerCCCArguments(SDOperand Op, SelectionDAG &DAG,
MachineFrameInfo *MFI = MF.getFrameInfo();
SDOperand Root = Op.getOperand(0);
bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
-
+ unsigned CC = MF.getFunction()->getCallingConv();
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(MF.getFunction()->getCallingConv(), isVarArg,
+ CCState CCInfo(CC, isVarArg,
getTargetMachine(), ArgLocs);
- CCInfo.AnalyzeFormalArguments(Op.Val, CC_X86_32_C);
-
+ // Check for possible tail call calling convention.
+ if (CC == CallingConv::Fast && PerformTailCallOpt)
+ CCInfo.AnalyzeFormalArguments(Op.Val, CC_X86_32_TailCall);
+ else
+ CCInfo.AnalyzeFormalArguments(Op.Val, CC_X86_32_C);
+
SmallVector<SDOperand, 8> ArgValues;
unsigned LastVal = ~0U;
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
@@ -877,6 +929,9 @@ SDOperand X86TargetLowering::LowerCCCArguments(SDOperand Op, SelectionDAG &DAG,
}
unsigned StackSize = CCInfo.getNextStackOffset();
+ // align stack specially for tail calls
+ if (CC==CallingConv::Fast)
+ StackSize = GetAlignedArgumentStackSize(StackSize,DAG);
ArgValues.push_back(Root);
@@ -885,7 +940,12 @@ SDOperand X86TargetLowering::LowerCCCArguments(SDOperand Op, SelectionDAG &DAG,
if (isVarArg)
VarArgsFrameIndex = MFI->CreateFixedObject(1, StackSize);
- if (isStdCall && !isVarArg) {
+ // Tail call calling convention (CallingConv::Fast) does not support varargs.
+ assert( !(isVarArg && CC == CallingConv::Fast) &&
+ "CallingConv::Fast does not support varargs.");
+
+ if (isStdCall && !isVarArg &&
+ (CC==CallingConv::Fast && PerformTailCallOpt || CC!=CallingConv::Fast)) {
BytesToPopOnReturn = StackSize; // Callee pops everything..
BytesCallerReserves = 0;
} else {
@@ -914,17 +974,21 @@ SDOperand X86TargetLowering::LowerCCCCallTo(SDOperand Op, SelectionDAG &DAG,
unsigned CC) {
SDOperand Chain = Op.getOperand(0);
bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
- bool isTailCall = cast<ConstantSDNode>(Op.getOperand(3))->getValue() != 0;
SDOperand Callee = Op.getOperand(4);
unsigned NumOps = (Op.getNumOperands() - 5) / 2;
-
+
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
- CCInfo.AnalyzeCallOperands(Op.Val, CC_X86_32_C);
+ if(CC==CallingConv::Fast && PerformTailCallOpt)
+ CCInfo.AnalyzeCallOperands(Op.Val, CC_X86_32_TailCall);
+ else
+ CCInfo.AnalyzeCallOperands(Op.Val, CC_X86_32_C);
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = CCInfo.getNextStackOffset();
+ if (CC==CallingConv::Fast)
+ NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
Chain = DAG.getCALLSEQ_START(Chain,DAG.getConstant(NumBytes, getPointerTy()));
@@ -1023,19 +1087,21 @@ SDOperand X86TargetLowering::LowerCCCCallTo(SDOperand Op, SelectionDAG &DAG,
if (InFlag.Val)
Ops.push_back(InFlag);
-
- Chain = DAG.getNode(isTailCall ? X86ISD::TAILCALL : X86ISD::CALL,
- NodeTys, &Ops[0], Ops.size());
+
+ Chain = DAG.getNode(X86ISD::CALL, NodeTys, &Ops[0], Ops.size());
InFlag = Chain.getValue(1);
// Create the CALLSEQ_END node.
unsigned NumBytesForCalleeToPush = 0;
- if (CC == CallingConv::X86_StdCall) {
+ if (CC == CallingConv::X86_StdCall ||
+ (CC == CallingConv::Fast && PerformTailCallOpt)) {
if (isVarArg)
NumBytesForCalleeToPush = isSRet ? 4 : 0;
else
NumBytesForCalleeToPush = NumBytes;
+ assert(!(isVarArg && CC==CallingConv::Fast) &&
+ "CallingConv::Fast does not support varargs.");
} else {
// If this is is a call to a struct-return function, the callee
// pops the hidden struct pointer, so we have to push it back.
@@ -1132,7 +1198,8 @@ X86TargetLowering::LowerFastCCArguments(SDOperand Op, SelectionDAG &DAG) {
if (!Subtarget->isTargetCygMing() && !Subtarget->isTargetWindows()) {
// Make sure the instruction takes 8n+4 bytes to make sure the start of the
- // arguments and the arguments after the retaddr has been pushed are aligned.
+ // arguments and the arguments after the retaddr has been pushed are
+ // aligned.
if ((StackSize & 7) == 0)
StackSize += 4;
}
@@ -1194,7 +1261,8 @@ SDOperand X86TargetLowering::LowerFastCCCallTo(SDOperand Op, SelectionDAG &DAG,
if (!Subtarget->isTargetCygMing() && !Subtarget->isTargetWindows()) {
// Make sure the instruction takes 8n+4 bytes to make sure the start of the
- // arguments and the arguments after the retaddr has been pushed are aligned.
+ // arguments and the arguments after the retaddr has been pushed are
+ // aligned.
if ((NumBytes & 7) == 0)
NumBytes += 4;
}
@@ -1292,8 +1360,8 @@ SDOperand X86TargetLowering::LowerFastCCCallTo(SDOperand Op, SelectionDAG &DAG,
if (InFlag.Val)
Ops.push_back(InFlag);
- // FIXME: Do not generate X86ISD::TAILCALL for now.
- Chain = DAG.getNode(isTailCall ? X86ISD::TAILCALL : X86ISD::CALL,
+ assert(isTailCall==false && "no tail call here");
+ Chain = DAG.getNode(X86ISD::CALL,
NodeTys, &Ops[0], Ops.size());
InFlag = Chain.getValue(1);
@@ -1312,6 +1380,314 @@ SDOperand X86TargetLowering::LowerFastCCCallTo(SDOperand Op, SelectionDAG &DAG,
return SDOperand(LowerCallResult(Chain, InFlag, Op.Val, CC, DAG), Op.ResNo);
}
+//===----------------------------------------------------------------------===//
+// Fast Calling Convention (tail call) implementation
+//===----------------------------------------------------------------------===//
+
+// Like std call, callee cleans arguments, convention except that ECX is
+// reserved for storing the tail called function address. Only 2 registers are
+// free for argument passing (inreg). Tail call optimization is performed
+// provided:
+// * tailcallopt is enabled
+// * caller/callee are fastcc
+// * elf/pic is disabled OR
+// * elf/pic enabled + callee is in module + callee has
+// visibility protected or hidden
+// To ensure the stack is aligned according to platform abi pass
+// tail-call-align-stack. This makes sure that argument delta is always
+// multiples of stack alignment. (Dynamic linkers need this - darwin's dyld for
+// example)
+// If a tail called function callee has more arguments than the caller the
+// caller needs to make sure that there is room to move the RETADDR to. This is
+// achived by reserving an area the size of the argument delta right after the
+// original REtADDR, but before the saved framepointer or the spilled registers
+// e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
+// stack layout:
+// arg1
+// arg2
+// RETADDR
+// [ new RETADDR
+// move area ]
+// (possible EBP)
+// ESI
+// EDI
+// local1 ..
+
+/// GetAlignedArgumentStackSize - Make the stack size align e.g 16n + 12 aligned
+/// for a 16 byte align requirement.
+unsigned X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
+ SelectionDAG& DAG) {
+ if (PerformTailCallOpt) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ const TargetMachine &TM = MF.getTarget();
+ const TargetFrameInfo &TFI = *TM.getFrameInfo();
+ unsigned StackAlignment = TFI.getStackAlignment();
+ uint64_t AlignMask = StackAlignment - 1;
+ int64_t Offset = StackSize;
+ unsigned SlotSize = Subtarget->is64Bit() ? 8 : 4;
+ if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
+ // Number smaller than 12 so just add the difference.
+ Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
+ } else {
+ // Mask out lower bits, add stackalignment once plus the 12 bytes.
+ Offset = ((~AlignMask) & Offset) + StackAlignment +
+ (StackAlignment-SlotSize);
+ }
+ StackSize = Offset;
+ }
+ return StackSize;
+}
+
+/// IsEligibleForTailCallElimination - Check to see whether the next instruction
+// following the call is a return. A function is eligible if caller/callee
+// calling conventions match, currently only fastcc supports tail calls, and the
+// function CALL is immediatly followed by a RET.
+bool X86TargetLowering::IsEligibleForTailCallOptimization(SDOperand Call,
+ SDOperand Ret,
+ SelectionDAG& DAG) const {
+ bool IsEligible = false;
+
+ // Check whether CALL node immediatly preceeds the RET node and whether the
+ // return uses the result of the node or is a void return.
+ if ((Ret.getNumOperands() == 1 &&
+ (Ret.getOperand(0)== SDOperand(Call.Val,1) ||
+ Ret.getOperand(0)== SDOperand(Call.Val,0))) ||
+ (Ret.getOperand(0)== SDOperand(Call.Val,Call.Val->getNumValues()-1) &&
+ Ret.getOperand(1)== SDOperand(Call.Val,0))) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ unsigned CallerCC = MF.getFunction()->getCallingConv();
+ unsigned CalleeCC = cast<ConstantSDNode>(Call.getOperand(1))->getValue();
+ if (CalleeCC == CallingConv::Fast && CallerCC == CalleeCC) {
+ SDOperand Callee = Call.getOperand(4);
+ // On elf/pic %ebx needs to be livein.
+ if(getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+ Subtarget->isPICStyleGOT()) {
+ // Can only do local tail calls with PIC.
+ GlobalValue * GV = 0;
+ GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
+ if(G != 0 &&
+ (GV = G->getGlobal()) &&
+ (GV->hasHiddenVisibility() || GV->hasProtectedVisibility()))
+ IsEligible=true;
+ } else {
+ IsEligible=true;
+ }
+ }
+ }
+ return IsEligible;
+}
+
+SDOperand X86TargetLowering::LowerX86_TailCallTo(SDOperand Op,
+ SelectionDAG &DAG,
+ unsigned CC) {
+ SDOperand Chain = Op.getOperand(0);
+ bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
+ bool isTailCall = cast<ConstantSDNode>(Op.getOperand(3))->getValue() != 0;
+ SDOperand Callee = Op.getOperand(4);
+ bool is64Bit = Subtarget->is64Bit();
+
+ assert(isTailCall && PerformTailCallOpt && "Should only emit tail calls.");
+
+ // Analyze operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
+ if (is64Bit)
+ CCInfo.AnalyzeCallOperands(Op.Val, CC_X86_64_TailCall);
+ else
+ CCInfo.AnalyzeCallOperands(Op.Val, CC_X86_32_TailCall);
+
+
+ // Lower arguments at fp - stackoffset + fpdiff.
+ MachineFunction &MF = DAG.getMachineFunction();
+
+ unsigned NumBytesToBePushed =
+ GetAlignedArgumentStackSize(CCInfo.getNextStackOffset(), DAG);
+
+ unsigned NumBytesCallerPushed =
+ MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn();
+ int FPDiff = NumBytesCallerPushed - NumBytesToBePushed;
+
+ // Set the delta of movement of the returnaddr stackslot.
+ // But only set if delta is greater than previous delta.
+ if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta()))
+ MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff);
+
+ // Adjust the ret address stack slot.
+ if (FPDiff) {
+ MVT::ValueType VT = is64Bit ? MVT::i64 : MVT::i32;
+ SDOperand RetAddrFrIdx = getReturnAddressFrameIndex(DAG);
+ RetAddrFrIdx =
+ DAG.getLoad(VT, DAG.getEntryNode(),RetAddrFrIdx, NULL, 0);
+ // Emit a store of the saved ret value to the new location.
+ int SlotSize = is64Bit ? 8 : 4;
+ int NewReturnAddrFI =
+ MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize);
+ SDOperand NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT);
+ Chain = DAG.getStore(Chain,RetAddrFrIdx, NewRetAddrFrIdx, NULL, 0);
+ }
+
+ Chain = DAG.
+ getCALLSEQ_START(Chain, DAG.getConstant(NumBytesToBePushed, getPointerTy()));
+
+ SmallVector<std::pair<unsigned, SDOperand>, 8> RegsToPass;
+ SmallVector<SDOperand, 8> MemOpChains;
+ SmallVector<SDOperand, 8> MemOpChains2;
+ SDOperand FramePtr, StackPtr;
+ SDOperand PtrOff;
+ SDOperand FIN;
+ int FI = 0;
+
+ // Walk the register/memloc assignments, inserting copies/loads. Lower
+ // arguments first to the stack slot where they would normally - in case of a
+ // normal function call - be.
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ SDOperand Arg = Op.getOperand(5+2*VA.getValNo());
+
+ // Promote the value if needed.
+ switch (VA.getLocInfo()) {
+ default: assert(0 && "Unknown loc info!");
+ case CCValAssign::Full: break;
+ case CCValAssign::SExt:
+ Arg = DAG.getNode(ISD::SIGN_EXTEND, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::ZExt:
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::AExt:
+ Arg = DAG.getNode(ISD::ANY_EXTEND, VA.getLocVT(), Arg);
+ break;
+ }
+
+ if (VA.isRegLoc()) {
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+ } else {
+ assert(VA.isMemLoc());
+ if (StackPtr.Val == 0)
+ StackPtr = DAG.getRegister(getStackPtrReg(), getPointerTy());
+
+ MemOpChains.push_back(LowerMemOpCallTo(Op, DAG, StackPtr, VA, Chain,
+ Arg));
+ }
+ }
+
+ if (!MemOpChains.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, MVT::Other,
+ &MemOpChains[0], MemOpChains.size());
+
+ // Build a sequence of copy-to-reg nodes chained together with token chain
+ // and flag operands which copy the outgoing args into registers.
+ SDOperand InFlag;
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+ Chain = DAG.getCopyToReg(Chain, RegsToPass[i].first, RegsToPass[i].second,
+ InFlag);
+ InFlag = Chain.getValue(1);
+ }
+ InFlag = SDOperand();
+ // Copy from stack slots to stack slot of a tail called function. This needs
+ // to be done because if we would lower the arguments directly to their real
+ // stack slot we might end up overwriting each other.
+ // TODO: To make this more efficient (sometimes saving a store/load) we could
+ // analyse the arguments and emit this store/load/store sequence only for
+ // arguments which would be overwritten otherwise.
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ if (!VA.isRegLoc()) {
+ SDOperand FlagsOp = Op.getOperand(6+2*VA.getValNo());
+ unsigned Flags = cast<ConstantSDNode>(FlagsOp)->getValue();
+
+ // Get source stack slot.
+ SDOperand PtrOff = DAG.getConstant(VA.getLocMemOffset(), getPointerTy());
+ PtrOff = DAG.getNode(ISD::ADD, getPointerTy(), StackPtr, PtrOff);
+ // Create frame index.
+ int32_t Offset = VA.getLocMemOffset()+FPDiff;
+ uint32_t OpSize = (MVT::getSizeInBits(VA.getLocVT())+7)/8;
+ FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset);
+ FIN = DAG.getFrameIndex(FI, MVT::i32);
+ if (Flags & ISD::ParamFlags::ByVal) {
+ // Copy relative to framepointer.
+ unsigned Align = 1 << ((Flags & ISD::ParamFlags::ByValAlign) >>
+ ISD::ParamFlags::ByValAlignOffs);
+
+ unsigned Size = (Flags & ISD::ParamFlags::ByValSize) >>
+ ISD::ParamFlags::ByValSizeOffs;
+
+ SDOperand AlignNode = DAG.getConstant(Align, MVT::i32);
+ SDOperand SizeNode = DAG.getConstant(Size, MVT::i32);
+ // Copy relative to framepointer.
+ MemOpChains2.push_back(DAG.getNode(ISD::MEMCPY, MVT::Other, Chain, FIN,
+ PtrOff, SizeNode, AlignNode));
+ } else {
+ SDOperand LoadedArg = DAG.getLoad(VA.getValVT(), Chain, PtrOff, NULL,0);
+ // Store relative to framepointer.
+ MemOpChains2.push_back(DAG.getStore(Chain, LoadedArg, FIN, NULL, 0));
+ }
+ }
+ }
+
+ if (!MemOpChains2.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, MVT::Other,
+ &MemOpChains2[0], MemOpChains.size());
+
+ // ELF / PIC requires GOT in the EBX register before function calls via PLT
+ // GOT pointer.
+ // Does not work with tail call since ebx is not restored correctly by
+ // tailcaller. TODO: at least for x86 - verify for x86-64
+
+ // If the callee is a GlobalAddress node (quite common, every direct call is)
+ // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+ // We should use extra load for direct calls to dllimported functions in
+ // non-JIT mode.
+ if (!Subtarget->GVRequiresExtraLoad(G->getGlobal(),
+ getTargetMachine(), true))
+ Callee = DAG.getTargetGlobalAddress(G->getGlobal(), getPointerTy());
+ } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee))
+ Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy());
+ else {
+ assert(Callee.getOpcode() == ISD::LOAD &&
+ "Function destination must be loaded into virtual register");
+ unsigned Opc = is64Bit ? X86::R9 : X86::ECX;
+
+ Chain = DAG.getCopyToReg(Chain,
+ DAG.getRegister(Opc, getPointerTy()) ,
+ Callee,InFlag);
+ Callee = DAG.getRegister(Opc, getPointerTy());
+ // Add register as live out.
+ DAG.getMachineFunction().addLiveOut(Opc);
+ }
+
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+ SmallVector<SDOperand, 8> Ops;
+
+ Ops.push_back(Chain);
+ Ops.push_back(DAG.getConstant(NumBytesToBePushed, getPointerTy()));
+ Ops.push_back(DAG.getConstant(0, getPointerTy()));
+ if (InFlag.Val)
+ Ops.push_back(InFlag);
+ Chain = DAG.getNode(ISD::CALLSEQ_END, NodeTys, &Ops[0], Ops.size());
+ InFlag = Chain.getValue(1);
+
+ // Returns a chain & a flag for retval copy to use.
+ NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+ Ops.clear();
+ Ops.push_back(Chain);
+ Ops.push_back(Callee);
+ Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
+ // Add argument registers to the end of the list so that they are known live
+ // into the call.
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+ Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+ RegsToPass[i].second.getValueType()));
+ if (InFlag.Val)
+ Ops.push_back(InFlag);
+ assert(InFlag.Val &&
+ "Flag must be set. Depend on flag being set in LowerRET");
+ Chain = DAG.getNode(X86ISD::TAILCALL,
+ Op.Val->getVTList(), &Ops[0], Ops.size());
+
+ return SDOperand(Chain.Val, Op.ResNo);
+}
//===----------------------------------------------------------------------===//
// X86-64 C Calling Convention implementation
@@ -1323,6 +1699,7 @@ X86TargetLowering::LowerX86_64CCCArguments(SDOperand Op, SelectionDAG &DAG) {
MachineFrameInfo *MFI = MF.getFrameInfo();
SDOperand Root = Op.getOperand(0);
bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
+ unsigned CC= MF.getFunction()->getCallingConv();
static const unsigned GPR64ArgRegs[] = {
X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
@@ -1335,9 +1712,12 @@ X86TargetLowering::LowerX86_64CCCArguments(SDOperand Op, SelectionDAG &DAG) {
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(MF.getFunction()->getCallingConv(), isVarArg,
+ CCState CCInfo(CC, isVarArg,
getTargetMachine(), ArgLocs);
- CCInfo.AnalyzeFormalArguments(Op.Val, CC_X86_64_C);
+ if (CC == CallingConv::Fast && PerformTailCallOpt)
+ CCInfo.AnalyzeFormalArguments(Op.Val, CC_X86_64_TailCall);
+ else
+ CCInfo.AnalyzeFormalArguments(Op.Val, CC_X86_64_C);
SmallVector<SDOperand, 8> ArgValues;
unsigned LastVal = ~0U;
@@ -1398,10 +1778,14 @@ X86TargetLowering::LowerX86_64CCCArguments(SDOperand Op, SelectionDAG &DAG) {
}
unsigned StackSize = CCInfo.getNextStackOffset();
+ if (CC==CallingConv::Fast)
+ StackSize =GetAlignedArgumentStackSize(StackSize, DAG);
// If the function takes variable number of arguments, make a frame index for
// the start of the first vararg value... for expansion of llvm.va_start.
if (isVarArg) {
+ assert(CC!=CallingConv::Fast
+ && "Var arg not supported with calling convention fastcc");
unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs, 6);
unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
@@ -1446,10 +1830,14 @@ X86TargetLowering::LowerX86_64CCCArguments(SDOperand Op, SelectionDAG &DAG) {
}
ArgValues.push_back(Root);
-
- BytesToPopOnReturn = 0; // Callee pops nothing.
- BytesCallerReserves = StackSize;
-
+ // Tail call convention (fastcc) needs callee pop.
+ if (CC == CallingConv::Fast && PerformTailCallOpt){
+ BytesToPopOnReturn = StackSize; // Callee pops everything.
+ BytesCallerReserves = 0;
+ } else {
+ BytesToPopOnReturn = 0; // Callee pops nothing.
+ BytesCallerReserves = StackSize;
+ }
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
FuncInfo->setBytesToPopOnReturn(BytesToPopOnReturn);
@@ -1463,16 +1851,21 @@ X86TargetLowering::LowerX86_64CCCCallTo(SDOperand Op, SelectionDAG &DAG,
unsigned CC) {
SDOperand Chain = Op.getOperand(0);
bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
- bool isTailCall = cast<ConstantSDNode>(Op.getOperand(3))->getValue() != 0;
SDOperand Callee = Op.getOperand(4);
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
- CCInfo.AnalyzeCallOperands(Op.Val, CC_X86_64_C);
+ if (CC==CallingConv::Fast)
+ CCInfo.AnalyzeCallOperands(Op.Val, CC_X86_64_TailCall);
+ else
+ CCInfo.AnalyzeCallOperands(Op.Val, CC_X86_64_C);
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = CCInfo.getNextStackOffset();
+ if (CC == CallingConv::Fast)
+ NumBytes = GetAlignedArgumentStackSize(NumBytes,DAG);
+
Chain = DAG.getCALLSEQ_START(Chain,DAG.getConstant(NumBytes, getPointerTy()));
SmallVector<std::pair<unsigned, SDOperand>, 8> RegsToPass;
@@ -1526,6 +1919,9 @@ X86TargetLowering::LowerX86_64CCCCallTo(SDOperand Op, SelectionDAG &DAG,
}
if (isVarArg) {
+ assert ( CallingConv::Fast != CC &&
+ "Var args not supported with calling convention fastcc");
+
// From AMD64 ABI document:
// For calls that may call functions that use varargs or stdargs
// (prototype-less calls or calls to functions containing ellipsis (...) in
@@ -1574,17 +1970,22 @@ X86TargetLowering::LowerX86_64CCCCallTo(SDOperand Op, SelectionDAG &DAG,
if (InFlag.Val)
Ops.push_back(InFlag);
- // FIXME: Do not generate X86ISD::TAILCALL for now.
- Chain = DAG.getNode(isTailCall ? X86ISD::TAILCALL : X86ISD::CALL,
+ Chain = DAG.getNode(X86ISD::CALL,
NodeTys, &Ops[0], Ops.size());
InFlag = Chain.getValue(1);
-
+ int NumBytesForCalleeToPush = 0;
+ if (CC==CallingConv::Fast) {
+ NumBytesForCalleeToPush = NumBytes; // Callee pops everything
+
+ } else {
+ NumBytesForCalleeToPush = 0; // Callee pops nothing.
+ }
// Returns a flag for retval copy to use.
NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
Ops.clear();
Ops.push_back(Chain);
Ops.push_back(DAG.getConstant(NumBytes, getPointerTy()));
- Ops.push_back(DAG.getConstant(0, getPointerTy()));
+ Ops.push_back(DAG.getConstant(NumBytesForCalleeToPush, getPointerTy()));
Ops.push_back(InFlag);
Chain = DAG.getNode(ISD::CALLSEQ_END, NodeTys, &Ops[0], Ops.size());
InFlag = Chain.getValue(1);
@@ -3106,10 +3507,14 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDOperand Op, SelectionDAG &DAG) {
// SHUFPS the element to the lowest double word, then movss.
MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(4);
SmallVector<SDOperand, 8> IdxVec;
- IdxVec.push_back(DAG.getConstant(Idx, MVT::getVectorElementType(MaskVT)));
- IdxVec.push_back(DAG.getNode(ISD::UNDEF, MVT::getVectorElementType(MaskVT)));
- IdxVec.push_back(DAG.getNode(ISD::UNDEF, MVT::getVectorElementType(MaskVT)));
- IdxVec.push_back(DAG.getNode(ISD::UNDEF, MVT::getVectorElementType(MaskVT)));
+ IdxVec.
+ push_back(DAG.getConstant(Idx, MVT::getVectorElementType(MaskVT)));
+ IdxVec.
+ push_back(DAG.getNode(ISD::UNDEF, MVT::getVectorElementType(MaskVT)));
+ IdxVec.
+ push_back(DAG.getNode(ISD::UNDEF, MVT::getVectorElementType(MaskVT)));
+ IdxVec.
+ push_back(DAG.getNode(ISD::UNDEF, MVT::getVectorElementType(MaskVT)));
SDOperand Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
&IdxVec[0], IdxVec.size());
Vec = DAG.getNode(ISD::VECTOR_SHUFFLE, Vec.getValueType(),
@@ -3128,7 +3533,8 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDOperand Op, SelectionDAG &DAG) {
MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(4);
SmallVector<SDOperand, 8> IdxVec;
IdxVec.push_back(DAG.getConstant(1, MVT::getVectorElementType(MaskVT)));
- IdxVec.push_back(DAG.getNode(ISD::UNDEF, MVT::getVectorElementType(MaskVT)));
+ IdxVec.
+ push_back(DAG.getNode(ISD::UNDEF, MVT::getVectorElementType(MaskVT)));
SDOperand Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
&IdxVec[0], IdxVec.size());
Vec = DAG.getNode(ISD::VECTOR_SHUFFLE, Vec.getValueType(),
@@ -3777,17 +4183,23 @@ SDOperand X86TargetLowering::LowerBRCOND(SDOperand Op, SelectionDAG &DAG) {
}
SDOperand X86TargetLowering::LowerCALL(SDOperand Op, SelectionDAG &DAG) {
- unsigned CallingConv= cast<ConstantSDNode>(Op.getOperand(1))->getValue();
-
- if (Subtarget->is64Bit())
- return LowerX86_64CCCCallTo(Op, DAG, CallingConv);
+ unsigned CallingConv = cast<ConstantSDNode>(Op.getOperand(1))->getValue();
+ bool isTailCall = cast<ConstantSDNode>(Op.getOperand(3))->getValue() != 0;
+
+ if (Subtarget->is64Bit())
+ if(CallingConv==CallingConv::Fast && isTailCall && PerformTailCallOpt)
+ return LowerX86_TailCallTo(Op, DAG, CallingConv);
+ else
+ return LowerX86_64CCCCallTo(Op, DAG, CallingConv);
else
switch (CallingConv) {
default:
assert(0 && "Unsupported calling convention");
case CallingConv::Fast:
- // TODO: Implement fastcc
- // Falls through
+ if (isTailCall && PerformTailCallOpt)
+ return LowerX86_TailCallTo(Op, DAG, CallingConv);
+ else
+ return LowerCCCCallTo(Op,DAG, CallingConv);
case CallingConv::C:
case CallingConv::X86_StdCall:
return LowerCCCCallTo(Op, DAG, CallingConv);
@@ -3855,8 +4267,7 @@ X86TargetLowering::LowerFORMAL_ARGUMENTS(SDOperand Op, SelectionDAG &DAG) {
default:
assert(0 && "Unsupported calling convention");
case CallingConv::Fast:
- // TODO: implement fastcc.
-
+ return LowerCCCArguments(Op,DAG, true);
// Falls through
case CallingConv::C:
return LowerCCCArguments(Op, DAG);
@@ -4176,7 +4587,8 @@ X86TargetLowering::LowerREADCYCLCECOUNTER(SDOperand Op, SelectionDAG &DAG) {
SDOperand TheOp = Op.getOperand(0);
SDOperand rd = DAG.getNode(X86ISD::RDTSC_DAG, Tys, &TheOp, 1);
if (Subtarget->is64Bit()) {
- SDOperand Copy1 = DAG.getCopyFromReg(rd, X86::RAX, MVT::i64, rd.getValue(1));
+ SDOperand Copy1 =
+ DAG.getCopyFromReg(rd, X86::RAX, MVT::i64, rd.getValue(1));
SDOperand Copy2 = DAG.getCopyFromReg(Copy1.getValue(1), X86::RDX,
MVT::i64, Copy1.getValue(2));
SDOperand Tmp = DAG.getNode(ISD::SHL, MVT::i64, Copy2,
@@ -4612,6 +5024,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
case X86ISD::THREAD_POINTER: return "X86ISD::THREAD_POINTER";
case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
+ case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
}
}
@@ -4885,7 +5298,7 @@ static SDOperand getShuffleScalarElt(SDNode *N, unsigned i, SelectionDAG &DAG) {
i %= NumElems;
if (V.getOpcode() == ISD::SCALAR_TO_VECTOR) {
return (i == 0)
- ? V.getOperand(0) : DAG.getNode(ISD::UNDEF, MVT::getVectorElementType(VT));
+ ? V.getOperand(0) : DAG.getNode(ISD::UNDEF, MVT::getVectorElementType(VT));
} else if (V.getOpcode() == ISD::VECTOR_SHUFFLE) {
SDOperand Idx = PermMask.getOperand(i);
if (Idx.getOpcode() == ISD::UNDEF)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index aa579d69f34..7123adaad27 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -181,7 +181,14 @@ namespace llvm {
TLSADDR, THREAD_POINTER,
// Exception Handling helpers
- EH_RETURN
+ EH_RETURN,
+
+ // tail call return
+ // oeprand #0 chain
+ // operand #1 callee (register or absolute)
+ // operand #2 stack adjustment
+ // operand #3 optional in flag
+ TC_RETURN
};
}
@@ -285,6 +292,7 @@ namespace llvm {
unsigned VarArgsFPOffset; // X86-64 vararg func fp reg offset.
int BytesToPopOnReturn; // Number of arg bytes ret should pop.
int BytesCallerReserves; // Number of arg bytes caller makes.
+
public:
explicit X86TargetLowering(TargetMachine &TM);
@@ -364,6 +372,14 @@ namespace llvm {
virtual bool isVectorClearMaskLegal(std::vector<SDOperand> &BVOps,
MVT::ValueType EVT,
SelectionDAG &DAG) const;
+
+ /// IsEligibleForTailCallOptimization - Check whether the call is eligible
+ /// for tail call optimization. Target which want to do tail call
+ /// optimization should implement this function.
+ virtual bool IsEligibleForTailCallOptimization(SDOperand Call,
+ SDOperand Ret,
+ SelectionDAG &DAG) const;
+
private:
/// Subtarget - Keep a pointer to the X86Subtarget around so that we can
/// make the right decision when generating code for different targets.
@@ -372,7 +388,7 @@ namespace llvm {
/// X86StackPtr - X86 physical register used as stack ptr.
unsigned X86StackPtr;
-
+
/// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87
/// floating point ops.
/// When SSE is available, use it for f32 operations.
@@ -402,6 +418,10 @@ namespace llvm {
SDOperand LowerX86_64CCCArguments(SDOperand Op, SelectionDAG &DAG);
SDOperand LowerX86_64CCCCallTo(SDOperand Op, SelectionDAG &DAG,unsigned CC);
+ // fast calling convention (tail call) implementation for 32/64bit
+ SDOperand LowerX86_TailCallTo(SDOperand Op,
+ SelectionDAG & DAG, unsigned CC);
+ unsigned GetAlignedArgumentStackSize(unsigned StackSize, SelectionDAG &DAG);
// Fast and FastCall Calling Convention implementation.
SDOperand LowerFastCCArguments(SDOperand Op, SelectionDAG &DAG);
SDOperand LowerFastCCCallTo(SDOperand Op, SelectionDAG &DAG, unsigned CC);
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 986aa0bc806..9d5e6371199 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -706,6 +706,8 @@ bool X86InstrInfo::BlockHasNoFallThrough(MachineBasicBlock &MBB) const {
if (MBB.empty()) return false;
switch (MBB.back().getOpcode()) {
+ case X86::TCRETURNri:
+ case X86::TCRETURNdi:
case X86::RET: // Return.
case X86::RETI:
case X86::TAILJMPd:
diff --git a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td
index 641eb2f7cc6..a6bd3fbbbde 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.td
+++ b/llvm/lib/Target/X86/X86InstrInfo.td
@@ -55,6 +55,8 @@ def SDT_X86TLSTP : SDTypeProfile<1, 0, [SDTCisPtrTy<0>]>;
def SDT_X86EHRET : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+def SDT_X86TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisVT<1, i32>]>;
+
def X86shld : SDNode<"X86ISD::SHLD", SDTIntShiftDOp>;
def X86shrd : SDNode<"X86ISD::SHRD", SDTIntShiftDOp>;
@@ -73,7 +75,7 @@ def X86callseq_start :
[SDNPHasChain, SDNPOutFlag]>;
def X86callseq_end :
SDNode<"ISD::CALLSEQ_END", SDT_X86CallSeqEnd,
- [SDNPHasChain, SDNPInFlag, SDNPOutFlag]>;
+ [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
def X86call : SDNode<"X86ISD::CALL", SDT_X86Call,
[SDNPHasChain, SDNPOutFlag, SDNPOptInFlag]>;
@@ -99,6 +101,8 @@ def X86TLStp : SDNode<"X86ISD::THREAD_POINTER", SDT_X86TLSTP, []>;
def X86ehret : SDNode<"X86ISD::EH_RETURN", SDT_X86EHRET,
[SDNPHasChain]>;
+def X86tcret : SDNode<"X86ISD::TC_RETURN", SDT_X86TCRET,
+ [SDNPHasChain, SDNPOptInFlag]>;
//===----------------------------------------------------------------------===//
// X86 Operand Definitions.
@@ -356,15 +360,30 @@ let isCall = 1 in
}
// Tail call stuff.
+
+def TAILCALL : I<0, Pseudo, (outs), (ins ),
+ "#TAILCALL",
+ []>;
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in
+def TCRETURNdi : I<0, Pseudo, (outs), (ins i32imm:$dst, i32imm:$offset),
+ "#TC_RETURN $dst $offset",
+ []>;
+
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in
- def TAILJMPd : IBr<0xE9, (ins i32imm:$dst), "jmp\t${dst:call} # TAIL CALL",
+def TCRETURNri : I<0, Pseudo, (outs), (ins GR32:$dst, i32imm:$offset),
+ "#TC_RETURN $dst $offset",
[]>;
+
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in
- def TAILJMPr : I<0xFF, MRM4r, (outs), (ins GR32:$dst), "jmp\t{*}$dst # TAIL CALL",
+ def TAILJMPd : IBr<0xE9, (ins i32imm:$dst), "jmp\t${dst:call} # TAILCALL",
[]>;
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in
+ def TAILJMPr : I<0xFF, MRM4r, (outs), (ins GR32:$dst), "jmp{l}\t{*}$dst # TAILCALL",
+ []>;
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in
def TAILJMPm : I<0xFF, MRM4m, (outs), (ins i32mem:$dst),
- "jmp\t{*}$dst # TAIL CALL", []>;
+ "jmp\t{*}$dst # TAILCALL", []>;
//===----------------------------------------------------------------------===//
// Miscellaneous Instructions...
@@ -2507,13 +2526,23 @@ def : Pat<(store (i32 (X86Wrapper texternalsym:$src)), addr:$dst),
(MOV32mi addr:$dst, texternalsym:$src)>;
// Calls
+// tailcall stuff
def : Pat<(X86tailcall GR32:$dst),
- (CALL32r GR32:$dst)>;
+ (TAILCALL)>;
def : Pat<(X86tailcall (i32 tglobaladdr:$dst)),
- (CALLpcrel32 tglobaladdr:$dst)>;
+ (TAILCALL)>;
def : Pat<(X86tailcall (i32 texternalsym:$dst)),
- (CALLpcrel32 texternalsym:$dst)>;
+ (TAILCALL)>;
+
+def : Pat<(X86tcret GR32:$dst, imm:$off),
+ (TCRETURNri GR32:$dst, imm:$off)>;
+
+def : Pat<(X86tcret (i32 tglobaladdr:$dst), imm:$off),
+ (TCRETURNdi texternalsym:$dst, imm:$off)>;
+
+def : Pat<(X86tcret (i32 texternalsym:$dst), imm:$off),
+ (TCRETURNdi texternalsym:$dst, imm:$off)>;
def : Pat<(X86call (i32 tglobaladdr:$dst)),
(CALLpcrel32 tglobaladdr:$dst)>;
diff --git a/llvm/lib/Target/X86/X86InstrX86-64.td b/llvm/lib/Target/X86/X86InstrX86-64.td
index f6f48a21d47..f501b5ec558 100644
--- a/llvm/lib/Target/X86/X86InstrX86-64.td
+++ b/llvm/lib/Target/X86/X86InstrX86-64.td
@@ -102,6 +102,23 @@ let isCall = 1 in
"call\t{*}$dst", []>;
}
+
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in
+def TCRETURNdi64 : I<0, Pseudo, (outs), (ins i64imm:$dst, i32imm:$offset),
+ "#TC_RETURN $dst $offset",
+ []>;
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in
+def TCRETURNri64 : I<0, Pseudo, (outs), (ins GR64:$dst, i32imm:$offset),
+ "#TC_RETURN $dst $offset",
+ []>;
+
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in
+ def TAILJMPr64 : I<0xFF, MRM4r, (outs), (ins GR64:$dst), "jmp{q}\t{*}$dst # TAILCALL",
+ []>;
+
// Branches
let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
def JMP64r : I<0xFF, MRM4r, (outs), (ins GR64:$dst), "jmp{q}\t{*}$dst",
@@ -1105,6 +1122,24 @@ def : Pat<(X86tailcall (i64 texternalsym:$dst)),
def : Pat<(X86tailcall GR64:$dst),
(CALL64r GR64:$dst)>;
+
+// tailcall stuff
+def : Pat<(X86tailcall GR32:$dst),
+ (TAILCALL)>;
+def : Pat<(X86tailcall (i64 tglobaladdr:$dst)),
+ (TAILCALL)>;
+def : Pat<(X86tailcall (i64 texternalsym:$dst)),
+ (TAILCALL)>;
+
+def : Pat<(X86tcret GR64:$dst, imm:$off),
+ (TCRETURNri64 GR64:$dst, imm:$off)>;
+
+def : Pat<(X86tcret (i64 tglobaladdr:$dst), imm:$off),
+ (TCRETURNdi64 texternalsym:$dst, imm:$off)>;
+
+def : Pat<(X86tcret (i64 texternalsym:$dst), imm:$off),
+ (TCRETURNdi64 texternalsym:$dst, imm:$off)>;
+
// Comparisons.
// TEST R,R is smaller than CMP R,0
diff --git a/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/llvm/lib/Target/X86/X86MachineFunctionInfo.h
index e50a104f216..05972c66c27 100644
--- a/llvm/lib/Target/X86/X86MachineFunctionInfo.h
+++ b/llvm/lib/Target/X86/X86MachineFunctionInfo.h
@@ -47,18 +47,26 @@ class X86MachineFunctionInfo : public MachineFunctionInfo {
// FrameIndex for return slot.
int ReturnAddrIndex;
+
+ // Delta the ReturnAddr stack slot is moved
+ // Used for creating an area before the register spill area on the stack
+ // the returnaddr can be savely move to this area
+ int TailCallReturnAddrDelta;
+
public:
X86MachineFunctionInfo() : ForceFramePointer(false),
CalleeSavedFrameSize(0),
BytesToPopOnReturn(0),
DecorationStyle(None),
- ReturnAddrIndex(0) {}
+ ReturnAddrIndex(0),
+ TailCallReturnAddrDelta(0){}
X86MachineFunctionInfo(MachineFunction &MF) : ForceFramePointer(false),
CalleeSavedFrameSize(0),
BytesToPopOnReturn(0),
DecorationStyle(None),
- ReturnAddrIndex(0) {}
+ ReturnAddrIndex(0),
+ TailCallReturnAddrDelta(0) {}
bool getForceFramePointer() const { return ForceFramePointer;}
void setForceFramePointer(bool forceFP) { ForceFramePointer = forceFP; }
@@ -74,6 +82,9 @@ public:
int getRAIndex() const { return ReturnAddrIndex; }
void setRAIndex(int Index) { ReturnAddrIndex = Index; }
+
+ int getTCReturnAddrDelta() const { return TailCallReturnAddrDelta; }
+ void setTCReturnAddrDelta(int delta) {TailCallReturnAddrDelta = delta;}
};
} // End llvm namespace
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp
index 83cb03c76f8..f017d4020ae 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -1436,18 +1436,42 @@ void X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
if (!hasFP(MF))
Offset += MF.getFrameInfo()->getStackSize();
- else
+ else {
Offset += SlotSize; // Skip the saved EBP
-
+ // Skip the RETADDR move area
+ X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
+ if (TailCallReturnAddrDelta < 0) Offset -= TailCallReturnAddrDelta;
+ }
+
MI.getOperand(i+3).ChangeToImmediate(Offset);
}
void
X86RegisterInfo::processFunctionBeforeFrameFinalized(MachineFunction &MF) const{
+ X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ int32_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
+ if (TailCallReturnAddrDelta < 0) {
+ // create RETURNADDR area
+ // arg
+ // arg
+ // RETADDR
+ // { ...
+ // RETADDR area
+ // ...
+ // }
+ // [EBP]
+ MF.getFrameInfo()->
+ CreateFixedObject(-TailCallReturnAddrDelta,
+ (-1*SlotSize)+TailCallReturnAddrDelta);
+ }
if (hasFP(MF)) {
+ assert((TailCallReturnAddrDelta <= 0) &&
+ "The Delta should always be zero or negative");
// Create a frame entry for the EBP register that must be saved.
int FrameIdx = MF.getFrameInfo()->CreateFixedObject(SlotSize,
- (int)SlotSize * -2);
+ (int)SlotSize * -2+
+ TailCallReturnAddrDelta);
assert(FrameIdx == MF.getFrameInfo()->getObjectIndexBegin() &&
"Slot for EBP register must be last in order to be found!");
}
@@ -1530,6 +1554,41 @@ void mergeSPUpdatesDown(MachineBasicBlock &MBB,
}
}
+/// mergeSPUpdates - Checks the instruction before/after the passed
+/// instruction. If it is an ADD/SUB instruction it is deleted
+/// argument and the stack adjustment is returned as a positive value for ADD
+/// and a negative for SUB.
+static int mergeSPUpdates(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI,
+ unsigned StackPtr,
+ bool doMergeWithPrevious) {
+
+ if ((doMergeWithPrevious && MBBI == MBB.begin()) ||
+ (!doMergeWithPrevious && MBBI == MBB.end()))
+ return 0;
+
+ int Offset = 0;
+
+ MachineBasicBlock::iterator PI = doMergeWithPrevious ? prior(MBBI) : MBBI;
+ MachineBasicBlock::iterator NI = doMergeWithPrevious ? 0 : next(MBBI);
+ unsigned Opc = PI->getOpcode();
+ if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 ||
+ Opc == X86::ADD32ri || Opc == X86::ADD32ri8) &&
+ PI->getOperand(0).getReg() == StackPtr){
+ Offset += PI->getOperand(2).getImm();
+ MBB.erase(PI);
+ if (!doMergeWithPrevious) MBBI = NI;
+ } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 ||
+ Opc == X86::SUB32ri || Opc == X86::SUB32ri8) &&
+ PI->getOperand(0).getReg() == StackPtr) {
+ Offset -= PI->getOperand(2).getImm();
+ MBB.erase(PI);
+ if (!doMergeWithPrevious) MBBI = NI;
+ }
+
+ return Offset;
+}
+
void X86RegisterInfo::emitPrologue(MachineFunction &MF) const {
MachineBasicBlock &MBB = MF.front(); // Prolog goes in entry BB
MachineFrameInfo *MFI = MF.getFrameInfo();
@@ -1543,10 +1602,23 @@ void X86RegisterInfo::emitPrologue(MachineFunction &MF) const {
// Prepare for frame info.
unsigned FrameLabelId = 0;
- // Get the number of bytes to allocate from the FrameInfo
+ // Get the number of bytes to allocate from the FrameInfo.
uint64_t StackSize = MFI->getStackSize();
+ // Add RETADDR move area to callee saved frame size.
+ int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
+ if (TailCallReturnAddrDelta < 0)
+ X86FI->setCalleeSavedFrameSize(
+ X86FI->getCalleeSavedFrameSize() +(-TailCallReturnAddrDelta));
uint64_t NumBytes = StackSize - X86FI->getCalleeSavedFrameSize();
+ // Insert stack pointer adjustment for later moving of return addr. Only
+ // applies to tail call optimized functions where the callee argument stack
+ // size is bigger than the callers.
+ if (TailCallReturnAddrDelta < 0) {
+ BuildMI(MBB, MBBI, TII.get(Is64Bit? X86::SUB64ri32 : X86::SUB32ri),
+ StackPtr).addReg(StackPtr).addImm(-TailCallReturnAddrDelta);
+ }
+
if (hasFP(MF)) {
// Get the offset of the stack slot for the EBP register... which is
// guaranteed to be the last slot by processFunctionBeforeFrameFinalized.
@@ -1615,6 +1687,10 @@ void X86RegisterInfo::emitPrologue(MachineFunction &MF) const {
MBB.insert(MBBI, MI);
}
} else {
+ // If there is an SUB32ri of ESP immediately before this instruction,
+ // merge the two. This can be the case when tail call elimination is
+ // enabled and the callee has more arguments then the caller.
+ NumBytes -= mergeSPUpdates(MBB, MBBI, StackPtr, true);
// If there is an ADD32ri or SUB32ri of ESP immediately after this
// instruction, merge the two instructions.
mergeSPUpdatesDown(MBB, MBBI, StackPtr, &NumBytes);
@@ -1711,6 +1787,10 @@ void X86RegisterInfo::emitEpilogue(MachineFunction &MF,
switch (RetOpcode) {
case X86::RET:
case X86::RETI:
+ case X86::TCRETURNdi:
+ case X86::TCRETURNri:
+ case X86::TCRETURNri64:
+ case X86::TCRETURNdi64:
case X86::EH_RETURN:
case X86::TAILJMPd:
case X86::TAILJMPr:
@@ -1773,7 +1853,46 @@ void X86RegisterInfo::emitEpilogue(MachineFunction &MF,
MachineOperand &DestAddr = MBBI->getOperand(0);
assert(DestAddr.isRegister() && "Offset should be in register!");
BuildMI(MBB, MBBI, TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr),StackPtr).
- addReg(DestAddr.getReg());
+ addReg(DestAddr.getReg());
+ // Tail call return: adjust the stack pointer and jump to callee
+ } else if (RetOpcode == X86::TCRETURNri || RetOpcode == X86::TCRETURNdi ||
+ RetOpcode== X86::TCRETURNri64 || RetOpcode == X86::TCRETURNdi64) {
+ MBBI = prior(MBB.end());
+ MachineOperand &JumpTarget = MBBI->getOperand(0);
+ MachineOperand &StackAdjust = MBBI->getOperand(1);
+ assert( StackAdjust.isImmediate() && "Expecting immediate value.");
+
+ // Adjust stack pointer.
+ int StackAdj = StackAdjust.getImm();
+ int MaxTCDelta = X86FI->getTCReturnAddrDelta();
+ int Offset = 0;
+ assert(MaxTCDelta <= 0 && "MaxTCDelta should never be positive");
+ // Incoporate the retaddr area.
+ Offset = StackAdj-MaxTCDelta;
+ assert(Offset >= 0 && "Offset should never be negative");
+ if (Offset) {
+ // Check for possible merge with preceeding ADD instruction.
+ Offset += mergeSPUpdates(MBB, MBBI, StackPtr, true);
+ emitSPUpdate(MBB, MBBI, StackPtr, Offset, Is64Bit, TII);
+ }
+ // Jump to label or value in register.
+ if (RetOpcode == X86::TCRETURNdi|| RetOpcode == X86::TCRETURNdi64)
+ BuildMI(MBB, MBBI, TII.get(X86::TAILJMPd)).
+ addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset());
+ else if (RetOpcode== X86::TCRETURNri64) {
+ BuildMI(MBB, MBBI, TII.get(X86::TAILJMPr64), JumpTarget.getReg());
+ } else
+ BuildMI(MBB, MBBI, TII.get(X86::TAILJMPr), JumpTarget.getReg());
+ // Delete the pseudo instruction TCRETURN.
+ MBB.erase(MBBI);
+ } else if ((RetOpcode == X86::RET || RetOpcode == X86::RETI) &&
+ (X86FI->getTCReturnAddrDelta() < 0)) {
+ // Add the return addr area delta back since we are not tail calling.
+ int delta = -1*X86FI->getTCReturnAddrDelta();
+ MBBI = prior(MBB.end());
+ // Check for possible merge with preceeding ADD instruction.
+ delta += mergeSPUpdates(MBB, MBBI, StackPtr, true);
+ emitSPUpdate(MBB, MBBI, StackPtr, delta, Is64Bit, TII);
}
}
OpenPOWER on IntegriCloud