-rw-r--r--  llvm/docs/BitCodeFormat.rst                         1
-rw-r--r--  llvm/docs/CodeGenerator.rst                         4
-rw-r--r--  llvm/docs/LangRef.rst                               17
-rw-r--r--  llvm/include/llvm/IR/CallingConv.h                  5
-rw-r--r--  llvm/lib/AsmParser/LLLexer.cpp                      1
-rw-r--r--  llvm/lib/AsmParser/LLParser.cpp                     2
-rw-r--r--  llvm/lib/AsmParser/LLToken.h                        1
-rw-r--r--  llvm/lib/CodeGen/Analysis.cpp                       3
-rw-r--r--  llvm/lib/IR/AsmWriter.cpp                           1
-rw-r--r--  llvm/lib/Target/X86/X86CallingConv.td               2
-rw-r--r--  llvm/lib/Target/X86/X86FastISel.cpp                 10
-rw-r--r--  llvm/lib/Target/X86/X86FrameLowering.cpp            3
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp             19
-rw-r--r--  llvm/lib/Target/X86/X86Subtarget.h                  1
-rw-r--r--  llvm/test/CodeGen/X86/musttail-tailcc.ll            114
-rw-r--r--  llvm/test/CodeGen/X86/tailcall-tailcc.ll            155
-rw-r--r--  llvm/test/CodeGen/X86/tailcc-calleesave.ll          19
-rw-r--r--  llvm/test/CodeGen/X86/tailcc-disable-tail-calls.ll  40
-rw-r--r--  llvm/test/CodeGen/X86/tailcc-fastcc.ll              49
-rw-r--r--  llvm/test/CodeGen/X86/tailcc-fastisel.ll            18
-rw-r--r--  llvm/test/CodeGen/X86/tailcc-largecode.ll           71
-rw-r--r--  llvm/test/CodeGen/X86/tailcc-stackalign.ll          23
-rw-r--r--  llvm/test/CodeGen/X86/tailcc-structret.ll           7
-rw-r--r--  llvm/test/CodeGen/X86/tailccbyval.ll                21
-rw-r--r--  llvm/test/CodeGen/X86/tailccbyval64.ll              42
-rw-r--r--  llvm/test/CodeGen/X86/tailccfp.ll                   6
-rw-r--r--  llvm/test/CodeGen/X86/tailccfp2.ll                  27
-rw-r--r--  llvm/test/CodeGen/X86/tailccpic1.ll                 16
-rw-r--r--  llvm/test/CodeGen/X86/tailccpic2.ll                 15
-rw-r--r--  llvm/test/CodeGen/X86/tailccstack64.ll              28
-rw-r--r--  llvm/utils/vim/syntax/llvm.vim                      1
31 files changed, 703 insertions(+), 19 deletions(-)
diff --git a/llvm/docs/BitCodeFormat.rst b/llvm/docs/BitCodeFormat.rst
index 4e653ae55d5..dce84620fd7 100644
--- a/llvm/docs/BitCodeFormat.rst
+++ b/llvm/docs/BitCodeFormat.rst
@@ -794,6 +794,7 @@ function. The operand fields are:
* ``preserve_allcc``: code 15
* ``swiftcc`` : code 16
* ``cxx_fast_tlscc``: code 17
+ * ``tailcc`` : code 18
* ``x86_stdcallcc``: code 64
* ``x86_fastcallcc``: code 65
* ``arm_apcscc``: code 66
diff --git a/llvm/docs/CodeGenerator.rst b/llvm/docs/CodeGenerator.rst
index 343b9879972..75330a5df3b 100644
--- a/llvm/docs/CodeGenerator.rst
+++ b/llvm/docs/CodeGenerator.rst
@@ -2068,12 +2068,12 @@ supported on x86/x86-64, PowerPC, and WebAssembly. It is performed on x86/x86-64
and PowerPC if:
* Caller and callee have the calling convention ``fastcc``, ``cc 10`` (GHC
- calling convention) or ``cc 11`` (HiPE calling convention).
+ calling convention), ``cc 11`` (HiPE calling convention), or ``tailcc``.
* The call is a tail call - in tail position (ret immediately follows call and
ret uses value of call or is void).
-* Option ``-tailcallopt`` is enabled.
+* Option ``-tailcallopt`` is enabled or the calling convention is ``tailcc``.
* Platform-specific constraints are met.
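
As a hedged illustration of these conditions (the function names below are invented for the example and are not part of this patch): both caller and callee use tailcc and the call sits in tail position, so after this change the tail call is guaranteed even without -tailcallopt.

; Illustrative sketch only -- not from this commit.
declare tailcc i32 @callee(i32)

define tailcc i32 @caller(i32 %x) {
entry:
  ; ret immediately follows the call and uses its value: tail position.
  %r = tail call tailcc i32 @callee(i32 %x)
  ret i32 %r
}

Running plain llc over this should emit a jmp to callee rather than a call, which is what the new tests at the end of this patch check for.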
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index d9a38907c92..e797b1f9a15 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -299,7 +299,7 @@ added in the future:
allows the target to use whatever tricks it wants to produce fast
code for the target, without having to conform to an externally
specified ABI (Application Binary Interface). `Tail calls can only
- be optimized when this, the GHC or the HiPE convention is
+ be optimized when this, the tailcc, the GHC or the HiPE convention is
used. <CodeGenerator.html#id80>`_ This calling convention does not
support varargs and requires the prototype of all callees to exactly
match the prototype of the function definition.
@@ -436,6 +436,14 @@ added in the future:
- On X86-64 RCX and R8 are available for additional integer returns, and
XMM2 and XMM3 are available for additional FP/vector returns.
- On iOS platforms, we use AAPCS-VFP calling convention.
+"``tailcc``" - Tail callable calling convention
+ This calling convention ensures that calls in tail position will always be
+ tail call optimized. This calling convention is equivalent to fastcc,
+ except for an additional guarantee that tail calls will be produced
+ whenever possible. `Tail calls can only be optimized when this, the fastcc,
+ the GHC or the HiPE convention is used. <CodeGenerator.html#id80>`_ This
+ calling convention does not support varargs and requires the prototype of
+ all callees to exactly match the prototype of the function definition.
"``cc <n>``" - Numbered convention
Any calling convention may be specified by number, allowing
target-specific calling conventions to be used. Target specific
@@ -10232,11 +10240,12 @@ This instruction requires several arguments:
Tail call optimization for calls marked ``tail`` is guaranteed to occur if
the following conditions are met:
- - Caller and callee both have the calling convention ``fastcc``.
+ - Caller and callee both have the calling convention ``fastcc`` or ``tailcc``.
- The call is in tail position (ret immediately follows call and ret
uses value of call or is void).
- - Option ``-tailcallopt`` is enabled, or
- ``llvm::GuaranteedTailCallOpt`` is ``true``.
+ - Option ``-tailcallopt`` is enabled,
+ ``llvm::GuaranteedTailCallOpt`` is ``true``, or the calling convention
+ is ``tailcc``.
- `Platform-specific constraints are
met. <CodeGenerator.html#tailcallopt>`_
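
For contrast with the guarantee list above, a minimal sketch of the fastcc case (illustrative names, assuming the documented semantics): the same call shape is only guaranteed to become a tail call when -tailcallopt is passed or llvm::GuaranteedTailCallOpt is set, whereas a tailcc version needs no flag.

; Illustrative sketch only -- guaranteed TCO for fastcc still requires
; llc -tailcallopt; with tailcc the guarantee holds by convention.
declare fastcc i32 @fast_callee(i32)

define fastcc i32 @fast_caller(i32 %x) {
entry:
  %r = tail call fastcc i32 @fast_callee(i32 %x)
  ret i32 %r
}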
diff --git a/llvm/include/llvm/IR/CallingConv.h b/llvm/include/llvm/IR/CallingConv.h
index 6f4989268fa..c1c979c2e2a 100644
--- a/llvm/include/llvm/IR/CallingConv.h
+++ b/llvm/include/llvm/IR/CallingConv.h
@@ -75,6 +75,11 @@ namespace CallingConv {
// CXX_FAST_TLS - Calling convention for access functions.
CXX_FAST_TLS = 17,
+ /// Tail - This calling convention attempts to make calls as fast as
+ /// possible while guaranteeing that tail call optimization can always
+ /// be performed.
+ Tail = 18,
+
// Target - This is the start of the target-specific calling conventions,
// e.g. fastcall and thiscall on X86.
FirstTargetCC = 64,
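
Because Tail is assigned ID 18, the convention can also be spelled numerically in textual IR. A round-trip sketch (hypothetical function name, assuming the parser and printer changes later in this patch):

; Illustrative sketch only: cc 18 is the numeric spelling of the new
; convention and should be printed back as tailcc by the updated AsmWriter.
define cc 18 void @numeric_tailcc() {
  ret void
}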
diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp
index 72d2357c293..5292b0e6274 100644
--- a/llvm/lib/AsmParser/LLLexer.cpp
+++ b/llvm/lib/AsmParser/LLLexer.cpp
@@ -622,6 +622,7 @@ lltok::Kind LLLexer::LexIdentifier() {
KEYWORD(amdgpu_ps);
KEYWORD(amdgpu_cs);
KEYWORD(amdgpu_kernel);
+ KEYWORD(tailcc);
KEYWORD(cc);
KEYWORD(c);
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index 5ea0b7d39c1..9bb3ca145c2 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -1955,6 +1955,7 @@ void LLParser::ParseOptionalDLLStorageClass(unsigned &Res) {
/// ::= 'amdgpu_ps'
/// ::= 'amdgpu_cs'
/// ::= 'amdgpu_kernel'
+/// ::= 'tailcc'
/// ::= 'cc' UINT
///
bool LLParser::ParseOptionalCallingConv(unsigned &CC) {
@@ -2000,6 +2001,7 @@ bool LLParser::ParseOptionalCallingConv(unsigned &CC) {
case lltok::kw_amdgpu_ps: CC = CallingConv::AMDGPU_PS; break;
case lltok::kw_amdgpu_cs: CC = CallingConv::AMDGPU_CS; break;
case lltok::kw_amdgpu_kernel: CC = CallingConv::AMDGPU_KERNEL; break;
+ case lltok::kw_tailcc: CC = CallingConv::Tail; break;
case lltok::kw_cc: {
Lex.Lex();
return ParseUInt32(CC);
diff --git a/llvm/lib/AsmParser/LLToken.h b/llvm/lib/AsmParser/LLToken.h
index 0e9ba4db474..f49feb2dc14 100644
--- a/llvm/lib/AsmParser/LLToken.h
+++ b/llvm/lib/AsmParser/LLToken.h
@@ -168,6 +168,7 @@ enum Kind {
kw_amdgpu_ps,
kw_amdgpu_cs,
kw_amdgpu_kernel,
+ kw_tailcc,
// Attributes:
kw_attributes,
diff --git a/llvm/lib/CodeGen/Analysis.cpp b/llvm/lib/CodeGen/Analysis.cpp
index 3ef90d32daf..6c059665fca 100644
--- a/llvm/lib/CodeGen/Analysis.cpp
+++ b/llvm/lib/CodeGen/Analysis.cpp
@@ -523,7 +523,8 @@ bool llvm::isInTailCallPosition(ImmutableCallSite CS, const TargetMachine &TM) {
// longjmp on x86), it can end up causing miscompilation that has not
// been fully understood.
if (!Ret &&
- (!TM.Options.GuaranteedTailCallOpt || !isa<UnreachableInst>(Term)))
+ ((!TM.Options.GuaranteedTailCallOpt &&
+ CS.getCallingConv() != CallingConv::Tail) || !isa<UnreachableInst>(Term)))
return false;
// If I will have a chain, make sure no other instruction that will have a
diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp
index 3f140ba01d8..91f22dbb17a 100644
--- a/llvm/lib/IR/AsmWriter.cpp
+++ b/llvm/lib/IR/AsmWriter.cpp
@@ -352,6 +352,7 @@ static void PrintCallingConv(unsigned cc, raw_ostream &Out) {
case CallingConv::PreserveAll: Out << "preserve_allcc"; break;
case CallingConv::CXX_FAST_TLS: Out << "cxx_fast_tlscc"; break;
case CallingConv::GHC: Out << "ghccc"; break;
+ case CallingConv::Tail: Out << "tailcc"; break;
case CallingConv::X86_StdCall: Out << "x86_stdcallcc"; break;
case CallingConv::X86_FastCall: Out << "x86_fastcallcc"; break;
case CallingConv::X86_ThisCall: Out << "x86_thiscallcc"; break;
diff --git a/llvm/lib/Target/X86/X86CallingConv.td b/llvm/lib/Target/X86/X86CallingConv.td
index 1c3034a5116..4c49d68bec9 100644
--- a/llvm/lib/Target/X86/X86CallingConv.td
+++ b/llvm/lib/Target/X86/X86CallingConv.td
@@ -433,6 +433,7 @@ defm X86_SysV64_RegCall :
def RetCC_X86_32 : CallingConv<[
// If FastCC, use RetCC_X86_32_Fast.
CCIfCC<"CallingConv::Fast", CCDelegateTo<RetCC_X86_32_Fast>>,
+ CCIfCC<"CallingConv::Tail", CCDelegateTo<RetCC_X86_32_Fast>>,
// If HiPE, use RetCC_X86_32_HiPE.
CCIfCC<"CallingConv::HiPE", CCDelegateTo<RetCC_X86_32_HiPE>>,
CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<RetCC_X86_32_VectorCall>>,
@@ -1000,6 +1001,7 @@ def CC_X86_32 : CallingConv<[
CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_Win32_VectorCall>>,
CCIfCC<"CallingConv::X86_ThisCall", CCDelegateTo<CC_X86_32_ThisCall>>,
CCIfCC<"CallingConv::Fast", CCDelegateTo<CC_X86_32_FastCC>>,
+ CCIfCC<"CallingConv::Tail", CCDelegateTo<CC_X86_32_FastCC>>,
CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_32_GHC>>,
CCIfCC<"CallingConv::HiPE", CCDelegateTo<CC_X86_32_HiPE>>,
CCIfCC<"CallingConv::X86_RegCall", CCDelegateTo<CC_X86_32_RegCall>>,
diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp
index 97abd084bf5..e5e089d07d5 100644
--- a/llvm/lib/Target/X86/X86FastISel.cpp
+++ b/llvm/lib/Target/X86/X86FastISel.cpp
@@ -1160,6 +1160,7 @@ bool X86FastISel::X86SelectRet(const Instruction *I) {
CallingConv::ID CC = F.getCallingConv();
if (CC != CallingConv::C &&
CC != CallingConv::Fast &&
+ CC != CallingConv::Tail &&
CC != CallingConv::X86_FastCall &&
CC != CallingConv::X86_StdCall &&
CC != CallingConv::X86_ThisCall &&
@@ -1173,7 +1174,8 @@ bool X86FastISel::X86SelectRet(const Instruction *I) {
// fastcc with -tailcallopt is intended to provide a guaranteed
// tail call optimization. Fastisel doesn't know how to do that.
- if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt)
+ if ((CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) ||
+ CC == CallingConv::Tail)
return false;
// Let SDISel handle vararg functions.
@@ -3157,7 +3159,7 @@ static unsigned computeBytesPoppedByCalleeForSRet(const X86Subtarget *Subtarget,
if (Subtarget->getTargetTriple().isOSMSVCRT())
return 0;
if (CC == CallingConv::Fast || CC == CallingConv::GHC ||
- CC == CallingConv::HiPE)
+ CC == CallingConv::HiPE || CC == CallingConv::Tail)
return 0;
if (CS)
@@ -3208,6 +3210,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
default: return false;
case CallingConv::C:
case CallingConv::Fast:
+ case CallingConv::Tail:
case CallingConv::WebKit_JS:
case CallingConv::Swift:
case CallingConv::X86_FastCall:
@@ -3224,7 +3227,8 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
// fastcc with -tailcallopt is intended to provide a guaranteed
// tail call optimization. Fastisel doesn't know how to do that.
- if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt)
+ if ((CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) ||
+ CC == CallingConv::Tail)
return false;
// Don't know how to handle Win64 varargs yet. Nothing special needed for
diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp
index af3a33ffd4e..fabc3e581ff 100644
--- a/llvm/lib/Target/X86/X86FrameLowering.cpp
+++ b/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -2269,7 +2269,8 @@ GetScratchRegister(bool Is64Bit, bool IsLP64, const MachineFunction &MF, bool Pr
bool IsNested = HasNestArgument(&MF);
if (CallingConvention == CallingConv::X86_FastCall ||
- CallingConvention == CallingConv::Fast) {
+ CallingConvention == CallingConv::Fast ||
+ CallingConvention == CallingConv::Tail) {
if (IsNested)
report_fatal_error("Segmented stacks does not support fastcall with "
"nested function.");
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 3806b0e2330..052300d6f72 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2963,7 +2963,7 @@ static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
static bool canGuaranteeTCO(CallingConv::ID CC) {
return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
- CC == CallingConv::HHVM);
+ CC == CallingConv::HHVM || CC == CallingConv::Tail);
}
/// Return true if we might ever do TCO for calls with this calling convention.
@@ -2989,7 +2989,7 @@ static bool mayTailCallThisCC(CallingConv::ID CC) {
/// Return true if the function is being made into a tailcall target by
/// changing its ABI.
static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
- return GuaranteedTailCallOpt && canGuaranteeTCO(CC);
+ return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) || CC == CallingConv::Tail;
}
bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
@@ -3615,6 +3615,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
bool IsSibcall = false;
+ bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt ||
+ CallConv == CallingConv::Tail;
X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls");
const auto *CI = dyn_cast_or_null<CallInst>(CLI.CS.getInstruction());
@@ -3635,8 +3637,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (Attr.getValueAsString() == "true")
isTailCall = false;
- if (Subtarget.isPICStyleGOT() &&
- !MF.getTarget().Options.GuaranteedTailCallOpt) {
+ if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO) {
// If we are using a GOT, disable tail calls to external symbols with
// default visibility. Tail calling such a symbol requires using a GOT
// relocation, which forces early binding of the symbol. This breaks code
@@ -3663,7 +3664,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Sibcalls are automatically detected tailcalls which do not require
// ABI changes.
- if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
+ if (!IsGuaranteeTCO && isTailCall)
IsSibcall = true;
if (isTailCall)
@@ -3695,8 +3696,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// This is a sibcall. The memory operands are available in caller's
// own caller's stack.
NumBytes = 0;
- else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
- canGuaranteeTCO(CallConv))
+ else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv))
NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
int FPDiff = 0;
@@ -4321,6 +4321,8 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization(
bool CCMatch = CallerCC == CalleeCC;
bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
+ bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt ||
+ CalleeCC == CallingConv::Tail;
// Win64 functions have extra shadow space for argument homing. Don't do the
// sibcall if the caller and callee have mismatched expectations for this
@@ -4328,7 +4330,7 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization(
if (IsCalleeWin64 != IsCallerWin64)
return false;
- if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
+ if (IsGuaranteeTCO) {
if (canGuaranteeTCO(CalleeCC) && CCMatch)
return true;
return false;
@@ -24421,6 +24423,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
case CallingConv::X86_FastCall:
case CallingConv::X86_ThisCall:
case CallingConv::Fast:
+ case CallingConv::Tail:
// Pass 'nest' parameter in EAX.
// Must be kept in sync with X86CallingConv.td
NestReg = X86::EAX;
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
index 4d7495641d9..b5b1c19c455 100644
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -815,6 +815,7 @@ public:
// On Win64, all these conventions just use the default convention.
case CallingConv::C:
case CallingConv::Fast:
+ case CallingConv::Tail:
case CallingConv::Swift:
case CallingConv::X86_FastCall:
case CallingConv::X86_StdCall:
diff --git a/llvm/test/CodeGen/X86/musttail-tailcc.ll b/llvm/test/CodeGen/X86/musttail-tailcc.ll
new file mode 100644
index 00000000000..6057045a77d
--- /dev/null
+++ b/llvm/test/CodeGen/X86/musttail-tailcc.ll
@@ -0,0 +1,114 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s -check-prefix=X32
+
+; tailcc will turn all of these musttail calls into tail calls.
+
+declare tailcc i32 @tailcallee(i32 %a1, i32 %a2)
+
+define tailcc i32 @tailcaller(i32 %in1, i32 %in2) nounwind {
+; X64-LABEL: tailcaller:
+; X64: # %bb.0: # %entry
+; X64-NEXT: pushq %rax
+; X64-NEXT: popq %rax
+; X64-NEXT: jmp tailcallee # TAILCALL
+;
+; X32-LABEL: tailcaller:
+; X32: # %bb.0: # %entry
+; X32-NEXT: jmp tailcallee # TAILCALL
+entry:
+ %tmp11 = musttail call tailcc i32 @tailcallee(i32 %in1, i32 %in2)
+ ret i32 %tmp11
+}
+
+declare tailcc i8* @alias_callee()
+
+define tailcc noalias i8* @noalias_caller() nounwind {
+; X64-LABEL: noalias_caller:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rax
+; X64-NEXT: popq %rax
+; X64-NEXT: jmp alias_callee # TAILCALL
+;
+; X32-LABEL: noalias_caller:
+; X32: # %bb.0:
+; X32-NEXT: jmp alias_callee # TAILCALL
+ %p = musttail call tailcc i8* @alias_callee()
+ ret i8* %p
+}
+
+declare tailcc noalias i8* @noalias_callee()
+
+define tailcc i8* @alias_caller() nounwind {
+; X64-LABEL: alias_caller:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rax
+; X64-NEXT: popq %rax
+; X64-NEXT: jmp noalias_callee # TAILCALL
+;
+; X32-LABEL: alias_caller:
+; X32: # %bb.0:
+; X32-NEXT: jmp noalias_callee # TAILCALL
+ %p = musttail call tailcc noalias i8* @noalias_callee()
+ ret i8* %p
+}
+
+define tailcc void @void_test(i32, i32, i32, i32) {
+; X64-LABEL: void_test:
+; X64: # %bb.0: # %entry
+; X64-NEXT: pushq %rax
+; X64-NEXT: .cfi_def_cfa_offset 16
+; X64-NEXT: popq %rax
+; X64-NEXT: .cfi_def_cfa_offset 8
+; X64-NEXT: jmp void_test # TAILCALL
+;
+; X32-LABEL: void_test:
+; X32: # %bb.0: # %entry
+; X32-NEXT: pushl %esi
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: subl $8, %esp
+; X32-NEXT: .cfi_def_cfa_offset 16
+; X32-NEXT: .cfi_offset %esi, -8
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X32-NEXT: addl $8, %esp
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: popl %esi
+; X32-NEXT: .cfi_def_cfa_offset 4
+; X32-NEXT: jmp void_test # TAILCALL
+ entry:
+ musttail call tailcc void @void_test( i32 %0, i32 %1, i32 %2, i32 %3)
+ ret void
+}
+
+define tailcc i1 @i1test(i32, i32, i32, i32) {
+; X64-LABEL: i1test:
+; X64: # %bb.0: # %entry
+; X64-NEXT: pushq %rax
+; X64-NEXT: .cfi_def_cfa_offset 16
+; X64-NEXT: popq %rax
+; X64-NEXT: .cfi_def_cfa_offset 8
+; X64-NEXT: jmp i1test # TAILCALL
+;
+; X32-LABEL: i1test:
+; X32: # %bb.0: # %entry
+; X32-NEXT: pushl %esi
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: subl $8, %esp
+; X32-NEXT: .cfi_def_cfa_offset 16
+; X32-NEXT: .cfi_offset %esi, -8
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X32-NEXT: addl $8, %esp
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: popl %esi
+; X32-NEXT: .cfi_def_cfa_offset 4
+; X32-NEXT: jmp i1test # TAILCALL
+ entry:
+ %4 = musttail call tailcc i1 @i1test( i32 %0, i32 %1, i32 %2, i32 %3)
+ ret i1 %4
+}
diff --git a/llvm/test/CodeGen/X86/tailcall-tailcc.ll b/llvm/test/CodeGen/X86/tailcall-tailcc.ll
new file mode 100644
index 00000000000..5a427034a72
--- /dev/null
+++ b/llvm/test/CodeGen/X86/tailcall-tailcc.ll
@@ -0,0 +1,155 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s -check-prefix=X32
+
+; With the tailcc convention, CodeGen guarantees a tail call optimization
+; for all of these, even without -tailcallopt.
+
+declare tailcc i32 @tailcallee(i32 %a1, i32 %a2, i32 %a3, i32 %a4)
+
+define tailcc i32 @tailcaller(i32 %in1, i32 %in2) nounwind {
+; X64-LABEL: tailcaller:
+; X64: # %bb.0: # %entry
+; X64-NEXT: pushq %rax
+; X64-NEXT: movl %edi, %edx
+; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: popq %rax
+; X64-NEXT: jmp tailcallee # TAILCALL
+;
+; X32-LABEL: tailcaller:
+; X32: # %bb.0: # %entry
+; X32-NEXT: subl $16, %esp
+; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X32-NEXT: addl $8, %esp
+; X32-NEXT: jmp tailcallee # TAILCALL
+entry:
+ %tmp11 = tail call tailcc i32 @tailcallee(i32 %in1, i32 %in2, i32 %in1, i32 %in2)
+ ret i32 %tmp11
+}
+
+declare tailcc i8* @alias_callee()
+
+define tailcc noalias i8* @noalias_caller() nounwind {
+; X64-LABEL: noalias_caller:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rax
+; X64-NEXT: popq %rax
+; X64-NEXT: jmp alias_callee # TAILCALL
+;
+; X32-LABEL: noalias_caller:
+; X32: # %bb.0:
+; X32-NEXT: jmp alias_callee # TAILCALL
+ %p = tail call tailcc i8* @alias_callee()
+ ret i8* %p
+}
+
+declare tailcc noalias i8* @noalias_callee()
+
+define tailcc i8* @alias_caller() nounwind {
+; X64-LABEL: alias_caller:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rax
+; X64-NEXT: popq %rax
+; X64-NEXT: jmp noalias_callee # TAILCALL
+;
+; X32-LABEL: alias_caller:
+; X32: # %bb.0:
+; X32-NEXT: jmp noalias_callee # TAILCALL
+ %p = tail call tailcc noalias i8* @noalias_callee()
+ ret i8* %p
+}
+
+declare tailcc i32 @i32_callee()
+
+define tailcc i32 @ret_undef() nounwind {
+; X64-LABEL: ret_undef:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rax
+; X64-NEXT: popq %rax
+; X64-NEXT: jmp i32_callee # TAILCALL
+;
+; X32-LABEL: ret_undef:
+; X32: # %bb.0:
+; X32-NEXT: jmp i32_callee # TAILCALL
+ %p = tail call tailcc i32 @i32_callee()
+ ret i32 undef
+}
+
+declare tailcc void @does_not_return()
+
+define tailcc i32 @noret() nounwind {
+; X64-LABEL: noret:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rax
+; X64-NEXT: popq %rax
+; X64-NEXT: jmp does_not_return # TAILCALL
+;
+; X32-LABEL: noret:
+; X32: # %bb.0:
+; X32-NEXT: jmp does_not_return # TAILCALL
+ tail call tailcc void @does_not_return()
+ unreachable
+}
+
+define tailcc void @void_test(i32, i32, i32, i32) {
+; X64-LABEL: void_test:
+; X64: # %bb.0: # %entry
+; X64-NEXT: pushq %rax
+; X64-NEXT: .cfi_def_cfa_offset 16
+; X64-NEXT: popq %rax
+; X64-NEXT: .cfi_def_cfa_offset 8
+; X64-NEXT: jmp void_test # TAILCALL
+;
+; X32-LABEL: void_test:
+; X32: # %bb.0: # %entry
+; X32-NEXT: pushl %esi
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: subl $8, %esp
+; X32-NEXT: .cfi_def_cfa_offset 16
+; X32-NEXT: .cfi_offset %esi, -8
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X32-NEXT: addl $8, %esp
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: popl %esi
+; X32-NEXT: .cfi_def_cfa_offset 4
+; X32-NEXT: jmp void_test # TAILCALL
+ entry:
+ tail call tailcc void @void_test( i32 %0, i32 %1, i32 %2, i32 %3)
+ ret void
+}
+
+define tailcc i1 @i1test(i32, i32, i32, i32) {
+; X64-LABEL: i1test:
+; X64: # %bb.0: # %entry
+; X64-NEXT: pushq %rax
+; X64-NEXT: .cfi_def_cfa_offset 16
+; X64-NEXT: popq %rax
+; X64-NEXT: .cfi_def_cfa_offset 8
+; X64-NEXT: jmp i1test # TAILCALL
+;
+; X32-LABEL: i1test:
+; X32: # %bb.0: # %entry
+; X32-NEXT: pushl %esi
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: subl $8, %esp
+; X32-NEXT: .cfi_def_cfa_offset 16
+; X32-NEXT: .cfi_offset %esi, -8
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X32-NEXT: addl $8, %esp
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: popl %esi
+; X32-NEXT: .cfi_def_cfa_offset 4
+; X32-NEXT: jmp i1test # TAILCALL
+ entry:
+ %4 = tail call tailcc i1 @i1test( i32 %0, i32 %1, i32 %2, i32 %3)
+ ret i1 %4
+}
diff --git a/llvm/test/CodeGen/X86/tailcc-calleesave.ll b/llvm/test/CodeGen/X86/tailcc-calleesave.ll
new file mode 100644
index 00000000000..09685fb17cb
--- /dev/null
+++ b/llvm/test/CodeGen/X86/tailcc-calleesave.ll
@@ -0,0 +1,19 @@
+; RUN: llc -mcpu=core < %s | FileCheck %s
+
+target triple = "i686-apple-darwin"
+
+declare tailcc void @foo(i32, i32, i32, i32, i32, i32)
+declare i32* @bar(i32*)
+
+define tailcc void @hoge(i32 %b) nounwind {
+; Do not overwrite pushed callee-save registers
+; CHECK: pushl
+; CHECK: subl $[[SIZE:[0-9]+]], %esp
+; CHECK-NOT: [[SIZE]](%esp)
+ %a = alloca i32
+ store i32 0, i32* %a
+ %d = tail call i32* @bar(i32* %a) nounwind
+ store i32 %b, i32* %d
+ tail call tailcc void @foo(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6) nounwind
+ ret void
+}
diff --git a/llvm/test/CodeGen/X86/tailcc-disable-tail-calls.ll b/llvm/test/CodeGen/X86/tailcc-disable-tail-calls.ll
new file mode 100644
index 00000000000..3199b8c34b7
--- /dev/null
+++ b/llvm/test/CodeGen/X86/tailcc-disable-tail-calls.ll
@@ -0,0 +1,40 @@
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s --check-prefix=NO-OPTION
+; RUN: llc < %s -mtriple=x86_64-- -disable-tail-calls | FileCheck %s --check-prefix=DISABLE-TRUE
+; RUN: llc < %s -mtriple=x86_64-- -disable-tail-calls=false | FileCheck %s --check-prefix=DISABLE-FALSE
+
+; Check that command line option "-disable-tail-calls" overrides function
+; attribute "disable-tail-calls".
+
+; NO-OPTION-LABEL: {{\_?}}func_attr
+; NO-OPTION: callq {{\_?}}callee
+
+; DISABLE-FALSE-LABEL: {{\_?}}func_attr
+; DISABLE-FALSE: jmp {{\_?}}callee
+
+; DISABLE-TRUE-LABEL: {{\_?}}func_attr
+; DISABLE-TRUE: callq {{\_?}}callee
+
+define tailcc i32 @func_attr(i32 %a) #0 {
+entry:
+ %call = tail call tailcc i32 @callee(i32 %a)
+ ret i32 %call
+}
+
+; NO-OPTION-LABEL: {{\_?}}func_noattr
+; NO-OPTION: jmp {{\_?}}callee
+
+; DISABLE-FALSE-LABEL: {{\_?}}func_noattr
+; DISABLE-FALSE: jmp {{\_?}}callee
+
+; DISABLE-TRUE-LABEL: {{\_?}}func_noattr
+; DISABLE-TRUE: callq {{\_?}}callee
+
+define tailcc i32 @func_noattr(i32 %a) {
+entry:
+ %call = tail call tailcc i32 @callee(i32 %a)
+ ret i32 %call
+}
+
+declare tailcc i32 @callee(i32)
+
+attributes #0 = { "disable-tail-calls"="true" }
diff --git a/llvm/test/CodeGen/X86/tailcc-fastcc.ll b/llvm/test/CodeGen/X86/tailcc-fastcc.ll
new file mode 100644
index 00000000000..03369855de4
--- /dev/null
+++ b/llvm/test/CodeGen/X86/tailcc-fastcc.ll
@@ -0,0 +1,49 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -tailcallopt < %s -mtriple=x86_64-unknown-unknown | FileCheck %s -check-prefix=X64
+; RUN: llc -tailcallopt < %s -mtriple=i686-unknown-unknown | FileCheck %s -check-prefix=X32
+
+; llc -tailcallopt should not enable tail calls from fastcc to tailcc or vice versa
+
+declare tailcc i32 @tailcallee1(i32 %a1, i32 %a2, i32 %a3, i32 %a4)
+
+define fastcc i32 @tailcaller1(i32 %in1, i32 %in2) nounwind {
+; X64-LABEL: tailcaller1:
+; X64: # %bb.0: # %entry
+; X64-NEXT: pushq %rax
+; X64-NEXT: movl %edi, %edx
+; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: callq tailcallee1
+; X64-NEXT: retq $8
+;
+; X32-LABEL: tailcaller1:
+; X32: # %bb.0: # %entry
+; X32-NEXT: pushl %edx
+; X32-NEXT: pushl %ecx
+; X32-NEXT: calll tailcallee1
+; X32-NEXT: retl
+entry:
+ %tmp11 = tail call tailcc i32 @tailcallee1(i32 %in1, i32 %in2, i32 %in1, i32 %in2)
+ ret i32 %tmp11
+}
+
+declare fastcc i32 @tailcallee2(i32 %a1, i32 %a2, i32 %a3, i32 %a4)
+
+define tailcc i32 @tailcaller2(i32 %in1, i32 %in2) nounwind {
+; X64-LABEL: tailcaller2:
+; X64: # %bb.0: # %entry
+; X64-NEXT: pushq %rax
+; X64-NEXT: movl %edi, %edx
+; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: callq tailcallee2
+; X64-NEXT: retq $8
+;
+; X32-LABEL: tailcaller2:
+; X32: # %bb.0: # %entry
+; X32-NEXT: pushl %edx
+; X32-NEXT: pushl %ecx
+; X32-NEXT: calll tailcallee2
+; X32-NEXT: retl
+entry:
+ %tmp11 = tail call fastcc i32 @tailcallee2(i32 %in1, i32 %in2, i32 %in1, i32 %in2)
+ ret i32 %tmp11
+}
diff --git a/llvm/test/CodeGen/X86/tailcc-fastisel.ll b/llvm/test/CodeGen/X86/tailcc-fastisel.ll
new file mode 100644
index 00000000000..e6d75faf4cd
--- /dev/null
+++ b/llvm/test/CodeGen/X86/tailcc-fastisel.ll
@@ -0,0 +1,18 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -fast-isel -fast-isel-abort=1 | FileCheck %s
+
+%0 = type { i64, i32, i8* }
+
+define tailcc i8* @"visit_array_aux<`Reference>"(%0 %arg, i32 %arg1) nounwind {
+fail: ; preds = %entry
+ %tmp20 = tail call tailcc i8* @"visit_array_aux<`Reference>"(%0 %arg, i32 undef) ; <i8*> [#uses=1]
+; CHECK: jmp "_visit_array_aux<`Reference>" ## TAILCALL
+ ret i8* %tmp20
+}
+
+define i32 @foo() nounwind {
+entry:
+ %0 = tail call i32 (...) @bar() nounwind ; <i32> [#uses=1]
+ ret i32 %0
+}
+
+declare i32 @bar(...) nounwind
diff --git a/llvm/test/CodeGen/X86/tailcc-largecode.ll b/llvm/test/CodeGen/X86/tailcc-largecode.ll
new file mode 100644
index 00000000000..a3b5c300745
--- /dev/null
+++ b/llvm/test/CodeGen/X86/tailcc-largecode.ll
@@ -0,0 +1,71 @@
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -code-model=large -enable-misched=false | FileCheck %s
+
+declare tailcc i32 @callee(i32 %arg)
+define tailcc i32 @directcall(i32 %arg) {
+entry:
+; This is the large code model, so &callee may not fit into the jmp
+; instruction. Instead, stick it into a register.
+; CHECK: movabsq $callee, [[REGISTER:%r[a-z0-9]+]]
+; CHECK: jmpq *[[REGISTER]] # TAILCALL
+ %res = tail call tailcc i32 @callee(i32 %arg)
+ ret i32 %res
+}
+
+; Check that the register used for an indirect tail call doesn't
+; clobber any of the arguments.
+define tailcc i32 @indirect_manyargs(i32(i32,i32,i32,i32,i32,i32,i32)* %target) {
+; Adjust the stack to enter the function. (The amount of the
+; adjustment may change in the future, in which case the location of
+; the stack argument and the return adjustment will change too.)
+; CHECK: pushq
+; Put the call target into a register (%rax here, per the CHECK below) that
+; won't be clobbered while restoring callee-saved registers and won't be
+; used for passing arguments.
+; CHECK: movq %rdi, %rax
+; Pass the stack argument.
+; CHECK: movl $7, 16(%rsp)
+; Pass the register arguments, in the right registers.
+; CHECK: movl $1, %edi
+; CHECK: movl $2, %esi
+; CHECK: movl $3, %edx
+; CHECK: movl $4, %ecx
+; CHECK: movl $5, %r8d
+; CHECK: movl $6, %r9d
+; Adjust the stack to "return".
+; CHECK: popq
+; And tail-call to the target.
+; CHECK: jmpq *%rax # TAILCALL
+ %res = tail call tailcc i32 %target(i32 1, i32 2, i32 3, i32 4, i32 5,
+ i32 6, i32 7)
+ ret i32 %res
+}
+
+; Check that the register used for a direct tail call doesn't clobber
+; any of the arguments.
+declare tailcc i32 @manyargs_callee(i32,i32,i32,i32,i32,i32,i32)
+define tailcc i32 @direct_manyargs() {
+; Adjust the stack to enter the function. (The amount of the
+; adjustment may change in the future, in which case the location of
+; the stack argument and the return adjustment will change too.)
+; CHECK: pushq
+; Pass the stack argument.
+; CHECK: movl $7, 16(%rsp)
+; This is the large code model, so &manyargs_callee may not fit into
+; the jmp instruction. Put it into a register which won't be clobbered
+; while restoring callee-saved registers and won't be used for passing
+; arguments.
+; CHECK: movabsq $manyargs_callee, %rax
+; Pass the register arguments, in the right registers.
+; CHECK: movl $1, %edi
+; CHECK: movl $2, %esi
+; CHECK: movl $3, %edx
+; CHECK: movl $4, %ecx
+; CHECK: movl $5, %r8d
+; CHECK: movl $6, %r9d
+; Adjust the stack to "return".
+; CHECK: popq
+; And tail-call to the target.
+; CHECK: jmpq *%rax # TAILCALL
+ %res = tail call tailcc i32 @manyargs_callee(i32 1, i32 2, i32 3, i32 4,
+ i32 5, i32 6, i32 7)
+ ret i32 %res
+}
diff --git a/llvm/test/CodeGen/X86/tailcc-stackalign.ll b/llvm/test/CodeGen/X86/tailcc-stackalign.ll
new file mode 100644
index 00000000000..36333a9a213
--- /dev/null
+++ b/llvm/test/CodeGen/X86/tailcc-stackalign.ll
@@ -0,0 +1,23 @@
+; RUN: llc < %s -mtriple=i686-unknown-linux -no-x86-call-frame-opt | FileCheck %s
+; Linux has 8-byte stack alignment, so the parameters give a stack size of 20;
+; ensure that a normal tailcc call has a matching stack size.
+
+
+define tailcc i32 @tailcallee(i32 %a1, i32 %a2, i32 %a3, i32 %a4) {
+ ret i32 %a3
+}
+
+define tailcc i32 @tailcaller(i32 %in1, i32 %in2, i32 %in3, i32 %in4) {
+ %tmp11 = tail call tailcc i32 @tailcallee(i32 %in1, i32 %in2,
+ i32 %in1, i32 %in2)
+ ret i32 %tmp11
+}
+
+define i32 @main(i32 %argc, i8** %argv) {
+ %tmp1 = call tailcc i32 @tailcaller( i32 1, i32 2, i32 3, i32 4 )
+ ; expect match subl [stacksize] here
+ ret i32 0
+}
+
+; CHECK: calll tailcaller
+; CHECK-NEXT: subl $12
diff --git a/llvm/test/CodeGen/X86/tailcc-structret.ll b/llvm/test/CodeGen/X86/tailcc-structret.ll
new file mode 100644
index 00000000000..2d83d4a3c9f
--- /dev/null
+++ b/llvm/test/CodeGen/X86/tailcc-structret.ll
@@ -0,0 +1,7 @@
+; RUN: llc < %s -mtriple=i686-unknown-linux | FileCheck %s
+define tailcc { { i8*, i8* }*, i8*} @init({ { i8*, i8* }*, i8*}, i32) {
+entry:
+ %2 = tail call tailcc { { i8*, i8* }*, i8* } @init({ { i8*, i8*}*, i8*} %0, i32 %1)
+ ret { { i8*, i8* }*, i8*} %2
+; CHECK: jmp init
+}
diff --git a/llvm/test/CodeGen/X86/tailccbyval.ll b/llvm/test/CodeGen/X86/tailccbyval.ll
new file mode 100644
index 00000000000..dbde868e511
--- /dev/null
+++ b/llvm/test/CodeGen/X86/tailccbyval.ll
@@ -0,0 +1,21 @@
+; RUN: llc < %s -mtriple=i686-unknown-linux | FileCheck %s
+%struct.s = type {i32, i32, i32, i32, i32, i32, i32, i32,
+ i32, i32, i32, i32, i32, i32, i32, i32,
+ i32, i32, i32, i32, i32, i32, i32, i32 }
+
+define tailcc i32 @tailcallee(%struct.s* byval %a) nounwind {
+entry:
+ %tmp2 = getelementptr %struct.s, %struct.s* %a, i32 0, i32 0
+ %tmp3 = load i32, i32* %tmp2
+ ret i32 %tmp3
+; CHECK: tailcallee
+; CHECK: movl 4(%esp), %eax
+}
+
+define tailcc i32 @tailcaller(%struct.s* byval %a) nounwind {
+entry:
+ %tmp4 = tail call tailcc i32 @tailcallee(%struct.s* byval %a )
+ ret i32 %tmp4
+; CHECK: tailcaller
+; CHECK: jmp tailcallee
+}
diff --git a/llvm/test/CodeGen/X86/tailccbyval64.ll b/llvm/test/CodeGen/X86/tailccbyval64.ll
new file mode 100644
index 00000000000..47d20ea972a
--- /dev/null
+++ b/llvm/test/CodeGen/X86/tailccbyval64.ll
@@ -0,0 +1,42 @@
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux | FileCheck %s
+
+; FIXME: Win64 does not support byval.
+
+; Expect the entry point.
+; CHECK-LABEL: tailcaller:
+
+; Expect 2 rep;movs because of tail call byval lowering.
+; CHECK: rep;
+; CHECK: rep;
+
+; A sequence of copyto/copyfrom virtual registers is used to deal with byval
+; lowering appearing after moving arguments to registers. The following two
+; checks verify that the register allocator changes those sequences to direct
+; moves to argument register where it can (for registers that are not used in
+; byval lowering - not rsi, not rdi, not rcx).
+; Expect argument 4 to be moved directly to register edx.
+; CHECK: movl $7, %edx
+
+; Expect argument 6 to be moved directly to register r8.
+; CHECK: movl $17, %r8d
+
+; Expect not call but jmp to @tailcallee.
+; CHECK: jmp tailcallee
+
+; Expect the trailer.
+; CHECK: .size tailcaller
+
+%struct.s = type { i64, i64, i64, i64, i64, i64, i64, i64,
+ i64, i64, i64, i64, i64, i64, i64, i64,
+ i64, i64, i64, i64, i64, i64, i64, i64 }
+
+declare tailcc i64 @tailcallee(%struct.s* byval %a, i64 %val, i64 %val2, i64 %val3, i64 %val4, i64 %val5)
+
+
+define tailcc i64 @tailcaller(i64 %b, %struct.s* byval %a) {
+entry:
+ %tmp2 = getelementptr %struct.s, %struct.s* %a, i32 0, i32 1
+ %tmp3 = load i64, i64* %tmp2, align 8
+ %tmp4 = tail call tailcc i64 @tailcallee(%struct.s* byval %a , i64 %tmp3, i64 %b, i64 7, i64 13, i64 17)
+ ret i64 %tmp4
+}
diff --git a/llvm/test/CodeGen/X86/tailccfp.ll b/llvm/test/CodeGen/X86/tailccfp.ll
new file mode 100644
index 00000000000..32814e93f45
--- /dev/null
+++ b/llvm/test/CodeGen/X86/tailccfp.ll
@@ -0,0 +1,6 @@
+; RUN: llc < %s -mtriple=i686-- | FileCheck %s
+define tailcc i32 @bar(i32 %X, i32(double, i32) *%FP) {
+ %Y = tail call tailcc i32 %FP(double 0.0, i32 %X)
+ ret i32 %Y
+; CHECK: jmpl
+}
diff --git a/llvm/test/CodeGen/X86/tailccfp2.ll b/llvm/test/CodeGen/X86/tailccfp2.ll
new file mode 100644
index 00000000000..f8b29b386ad
--- /dev/null
+++ b/llvm/test/CodeGen/X86/tailccfp2.ll
@@ -0,0 +1,27 @@
+; RUN: llc < %s -mtriple=i686-- | FileCheck %s
+
+declare i32 @putchar(i32)
+
+define tailcc i32 @checktail(i32 %x, i32* %f, i32 %g) nounwind {
+; CHECK-LABEL: checktail:
+ %tmp1 = icmp sgt i32 %x, 0
+ br i1 %tmp1, label %if-then, label %if-else
+
+if-then:
+ %fun_ptr = bitcast i32* %f to i32(i32, i32*, i32)*
+ %arg1 = add i32 %x, -1
+ call i32 @putchar(i32 90)
+; CHECK: jmpl *%e{{.*}}
+ %res = tail call tailcc i32 %fun_ptr( i32 %arg1, i32 * %f, i32 %g)
+ ret i32 %res
+
+if-else:
+ ret i32 %x
+}
+
+
+define i32 @main() nounwind {
+ %f = bitcast i32 (i32, i32*, i32)* @checktail to i32*
+ %res = tail call tailcc i32 @checktail( i32 10, i32* %f,i32 10)
+ ret i32 %res
+}
diff --git a/llvm/test/CodeGen/X86/tailccpic1.ll b/llvm/test/CodeGen/X86/tailccpic1.ll
new file mode 100644
index 00000000000..de8f2219bc2
--- /dev/null
+++ b/llvm/test/CodeGen/X86/tailccpic1.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s -mtriple=i686-pc-linux-gnu -relocation-model=pic | FileCheck %s
+
+; This test uses guaranteed TCO so these will be tail calls, despite the early
+; binding issues.
+
+define protected tailcc i32 @tailcallee(i32 %a1, i32 %a2, i32 %a3, i32 %a4) {
+entry:
+ ret i32 %a3
+}
+
+define tailcc i32 @tailcaller(i32 %in1, i32 %in2) {
+entry:
+ %tmp11 = tail call tailcc i32 @tailcallee( i32 %in1, i32 %in2, i32 %in1, i32 %in2 ) ; <i32> [#uses=1]
+ ret i32 %tmp11
+; CHECK: jmp tailcallee
+}
diff --git a/llvm/test/CodeGen/X86/tailccpic2.ll b/llvm/test/CodeGen/X86/tailccpic2.ll
new file mode 100644
index 00000000000..314cd8f2fd6
--- /dev/null
+++ b/llvm/test/CodeGen/X86/tailccpic2.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -mtriple=i686-pc-linux-gnu -relocation-model=pic | FileCheck %s
+
+define tailcc i32 @tailcallee(i32 %a1, i32 %a2, i32 %a3, i32 %a4) {
+entry:
+ ret i32 %a3
+}
+
+define tailcc i32 @tailcaller(i32 %in1, i32 %in2) {
+entry:
+ %tmp11 = tail call tailcc i32 @tailcallee( i32 %in1, i32 %in2, i32 %in1, i32 %in2 ) ; <i32> [#uses=1]
+ ret i32 %tmp11
+; CHECK: movl tailcallee@GOT
+; CHECK: jmpl
+}
+
diff --git a/llvm/test/CodeGen/X86/tailccstack64.ll b/llvm/test/CodeGen/X86/tailccstack64.ll
new file mode 100644
index 00000000000..bd0f4a73950
--- /dev/null
+++ b/llvm/test/CodeGen/X86/tailccstack64.ll
@@ -0,0 +1,28 @@
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux -post-RA-scheduler=true | FileCheck %s
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-win32 -post-RA-scheduler=true | FileCheck %s
+
+; FIXME: Redundant unused stack allocation could be eliminated.
+; CHECK: subq ${{24|72|80}}, %rsp
+
+; Check that lowered arguments on the stack do not overwrite each other.
+; Move param %in1 from the stack into a temporary register (R1).
+; CHECK: movl [[A1:32|144]](%rsp), [[R1:%e..]]
+; Move param %in2 from the stack into a temporary register (R2).
+; CHECK: movl [[A2:40|152]](%rsp), [[R2:%[a-z0-9]+]]
+; Add %p1 to %in1 in R1.
+; CHECK: addl {{%edi|%ecx}}, [[R1]]
+; Move param %in2 to stack.
+; CHECK-DAG: movl [[R2]], [[A1]](%rsp)
+; Move result of addition to stack.
+; CHECK-DAG: movl [[R1]], [[A2]](%rsp)
+; Eventually, do a TAILCALL
+; CHECK: TAILCALL
+
+declare tailcc i32 @tailcallee(i32 %p1, i32 %p2, i32 %p3, i32 %p4, i32 %p5, i32 %p6, i32 %a, i32 %b) nounwind
+
+define tailcc i32 @tailcaller(i32 %p1, i32 %p2, i32 %p3, i32 %p4, i32 %p5, i32 %p6, i32 %in1, i32 %in2) nounwind {
+entry:
+ %tmp = add i32 %in1, %p1
+ %retval = tail call tailcc i32 @tailcallee(i32 %p1, i32 %p2, i32 %p3, i32 %p4, i32 %p5, i32 %p6, i32 %in2,i32 %tmp)
+ ret i32 %retval
+}
diff --git a/llvm/utils/vim/syntax/llvm.vim b/llvm/utils/vim/syntax/llvm.vim
index 14987cb2348..487a37b4b86 100644
--- a/llvm/utils/vim/syntax/llvm.vim
+++ b/llvm/utils/vim/syntax/llvm.vim
@@ -82,6 +82,7 @@ syn keyword llvmKeyword
\ externally_initialized
\ extern_weak
\ fastcc
+ \ tailcc
\ filter
\ from
\ gc