Diffstat (limited to 'llvm')
-rw-r--r-- | llvm/lib/Target/X86/X86.td | 9
-rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 5
-rw-r--r-- | llvm/lib/Target/X86/X86IndirectThunks.cpp | 49
-rw-r--r-- | llvm/lib/Target/X86/X86Subtarget.h | 13
-rw-r--r-- | llvm/test/CodeGen/X86/lvi-hardening-indirectbr.ll | 281
5 files changed, 349 insertions, 8 deletions
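
In short, this patch adds an x86 subtarget feature, -mattr=+lvi-cfi, that routes every indirect call and branch through a serializing thunk, __llvm_lvi_thunk_r11, instead of emitting the transfer directly from a register or memory operand. As a minimal illustrative sketch of the effect (not part of the patch; the function name is hypothetical and the checks are modeled on the new test at the end of this diff, not verified output):

; Hypothetical example, modeled on llvm/test/CodeGen/X86/lvi-hardening-indirectbr.ll below.
; RUN: llc -mtriple=x86_64-unknown -mattr=+lvi-cfi < %s | FileCheck %s

define void @call_through_pointer(void ()* %fp) {
entry:
  ; With +lvi-cfi the backend avoids emitting `callq *%rdi` (or a
  ; memory-operand form); the target is copied into %r11 and control flow
  ; goes through the serializing thunk instead.
  call void %fp()
  ret void
}

; CHECK-LABEL: call_through_pointer:
; CHECK: movq %rdi, %r11
; CHECK: {{(callq|jmp)}} __llvm_lvi_thunk_r11

; The thunk itself is emitted once per module and simply fences before
; jumping through %r11:
; CHECK: __llvm_lvi_thunk_r11:
; CHECK: lfence
; CHECK: jmpq *%r11

The lowering reuses the existing retpoline thunk machinery, which is why the diff mostly extends X86IndirectThunks.cpp and the thunk-related Subtarget getters rather than adding a separate pass.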
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index a2b11d55f65..edc47482576 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -426,6 +426,15 @@ def FeatureRetpolineExternalThunk "ourselves. Only has effect when combined with some other retpoline " "feature", [FeatureRetpolineIndirectCalls]>; +// Mitigate LVI attacks against indirect calls/branches and call returns +def FeatureLVIControlFlowIntegrity + : SubtargetFeature< + "lvi-cfi", "UseLVIControlFlowIntegrity", "true", + "Prevent indirect calls/branches from using a memory operand, and " + "precede all indirect calls/branches from a register with an " + "LFENCE instruction to serialize control flow. Also decompose RET " + "instructions into a POP+LFENCE+JMP sequence.">; + // Direct Move instructions. def FeatureMOVDIRI : SubtargetFeature<"movdiri", "HasMOVDIRI", "true", "Support movdiri instruction">; diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index dbcebd54f73..c8720d9ae3a 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -31417,6 +31417,11 @@ static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget, } llvm_unreachable("unexpected reg for retpoline"); } + + if (Subtarget.useLVIControlFlowIntegrity()) { + assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!"); + return "__llvm_lvi_thunk_r11"; + } llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature"); } diff --git a/llvm/lib/Target/X86/X86IndirectThunks.cpp b/llvm/lib/Target/X86/X86IndirectThunks.cpp index e6408e986f1..36b9c3ccc95 100644 --- a/llvm/lib/Target/X86/X86IndirectThunks.cpp +++ b/llvm/lib/Target/X86/X86IndirectThunks.cpp @@ -14,6 +14,8 @@ /// /// Currently supported thunks include: /// - Retpoline -- A RET-implemented trampoline that lowers indirect calls +/// - LVI Thunk -- A CALL/JMP-implemented thunk that forces load serialization +/// before making an indirect call/jump /// /// Note that the reason that this is implemented as a MachineFunctionPass and /// not a ModulePass is that ModulePasses at this point in the LLVM X86 pipeline @@ -44,11 +46,14 @@ using namespace llvm; #define DEBUG_TYPE "x86-retpoline-thunks" static const char RetpolineNamePrefix[] = "__llvm_retpoline_"; -static const char R11RetpolineName[] = "__llvm_retpoline_r11"; -static const char EAXRetpolineName[] = "__llvm_retpoline_eax"; -static const char ECXRetpolineName[] = "__llvm_retpoline_ecx"; -static const char EDXRetpolineName[] = "__llvm_retpoline_edx"; -static const char EDIRetpolineName[] = "__llvm_retpoline_edi"; +static const char R11RetpolineName[] = "__llvm_retpoline_r11"; +static const char EAXRetpolineName[] = "__llvm_retpoline_eax"; +static const char ECXRetpolineName[] = "__llvm_retpoline_ecx"; +static const char EDXRetpolineName[] = "__llvm_retpoline_edx"; +static const char EDIRetpolineName[] = "__llvm_retpoline_edi"; + +static const char LVIThunkNamePrefix[] = "__llvm_lvi_thunk_"; +static const char R11LVIThunkName[] = "__llvm_lvi_thunk_r11"; namespace { template <typename Derived> class ThunkInserter { @@ -80,6 +85,38 @@ struct RetpolineThunkInserter : ThunkInserter<RetpolineThunkInserter> { void populateThunk(MachineFunction &MF); }; +struct LVIThunkInserter : ThunkInserter<LVIThunkInserter> { + const char *getThunkPrefix() { return LVIThunkNamePrefix; } + bool mayUseThunk(const MachineFunction &MF) { + return MF.getSubtarget<X86Subtarget>().useLVIControlFlowIntegrity(); + } + void 
insertThunks(MachineModuleInfo &MMI) { + createThunkFunction(MMI, R11LVIThunkName); + } + void populateThunk(MachineFunction &MF) { + // Grab the entry MBB and erase any other blocks. O0 codegen appears to + // generate two bbs for the entry block. + MachineBasicBlock *Entry = &MF.front(); + Entry->clear(); + while (MF.size() > 1) + MF.erase(std::next(MF.begin())); + + // This code mitigates LVI by replacing each indirect call/jump with a + // direct call/jump to a thunk that looks like: + // ``` + // lfence + // jmpq *%r11 + // ``` + // This ensures that if the value in register %r11 was loaded from memory, + // then the value in %r11 is (architecturally) correct prior to the jump. + const TargetInstrInfo *TII = MF.getSubtarget<X86Subtarget>().getInstrInfo(); + BuildMI(&MF.front(), DebugLoc(), TII->get(X86::LFENCE)); + BuildMI(&MF.front(), DebugLoc(), TII->get(X86::JMP64r)).addReg(X86::R11); + MF.front().addLiveIn(X86::R11); + return; + } +}; + class X86IndirectThunks : public MachineFunctionPass { public: static char ID; @@ -98,7 +135,7 @@ public: } private: - std::tuple<RetpolineThunkInserter> TIs; + std::tuple<RetpolineThunkInserter, LVIThunkInserter> TIs; // FIXME: When LLVM moves to C++17, these can become folds template <typename... ThunkInserterT> diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h index 35983797bd3..eb5c293e5cb 100644 --- a/llvm/lib/Target/X86/X86Subtarget.h +++ b/llvm/lib/Target/X86/X86Subtarget.h @@ -421,6 +421,12 @@ protected: /// than emitting one inside the compiler. bool UseRetpolineExternalThunk = false; + /// Prevent generation of indirect call/branch instructions from memory, + /// and force all indirect call/branch instructions from a register to be + /// preceded by an LFENCE. Also decompose RET instructions into a + /// POP+LFENCE+JMP sequence. + bool UseLVIControlFlowIntegrity = false; + /// Use software floating point for code generation. bool UseSoftFloat = false; @@ -711,13 +717,16 @@ public: // These are generic getters that OR together all of the thunk types // supported by the subtarget. Therefore useIndirectThunk*() will return true // if any respective thunk feature is enabled. - bool useIndirectThunkCalls() const { return useRetpolineIndirectCalls(); } + bool useIndirectThunkCalls() const { + return useRetpolineIndirectCalls() || useLVIControlFlowIntegrity(); + } bool useIndirectThunkBranches() const { - return useRetpolineIndirectBranches(); + return useRetpolineIndirectBranches() || useLVIControlFlowIntegrity(); } bool preferMaskRegisters() const { return PreferMaskRegisters; } bool useGLMDivSqrtCosts() const { return UseGLMDivSqrtCosts; } + bool useLVIControlFlowIntegrity() const { return UseLVIControlFlowIntegrity; } unsigned getPreferVectorWidth() const { return PreferVectorWidth; } unsigned getRequiredVectorWidth() const { return RequiredVectorWidth; } diff --git a/llvm/test/CodeGen/X86/lvi-hardening-indirectbr.ll b/llvm/test/CodeGen/X86/lvi-hardening-indirectbr.ll new file mode 100644 index 00000000000..d2caf6e1e9e --- /dev/null +++ b/llvm/test/CodeGen/X86/lvi-hardening-indirectbr.ll @@ -0,0 +1,281 @@ +; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown -mattr=+lvi-cfi < %s | FileCheck %s --check-prefix=X64 +; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown -mattr=+lvi-cfi -O0 < %s | FileCheck %s --check-prefix=X64FAST +; +; Note that a lot of this code was lifted from retpoline.ll. + +declare void @bar(i32) + +; Test a simple indirect call and tail call. 
+define void @icall_reg(void (i32)* %fp, i32 %x) { +entry: + tail call void @bar(i32 %x) + tail call void %fp(i32 %x) + tail call void @bar(i32 %x) + tail call void %fp(i32 %x) + ret void +} + +; X64-LABEL: icall_reg: +; X64-DAG: movq %rdi, %[[fp:[^ ]*]] +; X64-DAG: movl %esi, %[[x:[^ ]*]] +; X64: movl %esi, %edi +; X64: callq bar +; X64-DAG: movl %[[x]], %edi +; X64-DAG: movq %[[fp]], %r11 +; X64: callq __llvm_lvi_thunk_r11 +; X64: movl %[[x]], %edi +; X64: callq bar +; X64-DAG: movl %[[x]], %edi +; X64-DAG: movq %[[fp]], %r11 +; X64: jmp __llvm_lvi_thunk_r11 # TAILCALL + +; X64FAST-LABEL: icall_reg: +; X64FAST: callq bar +; X64FAST: callq __llvm_lvi_thunk_r11 +; X64FAST: callq bar +; X64FAST: jmp __llvm_lvi_thunk_r11 # TAILCALL + + +@global_fp = external global void (i32)* + +; Test an indirect call through a global variable. +define void @icall_global_fp(i32 %x, void (i32)** %fpp) #0 { + %fp1 = load void (i32)*, void (i32)** @global_fp + call void %fp1(i32 %x) + %fp2 = load void (i32)*, void (i32)** @global_fp + tail call void %fp2(i32 %x) + ret void +} + +; X64-LABEL: icall_global_fp: +; X64-DAG: movl %edi, %[[x:[^ ]*]] +; X64-DAG: movq global_fp(%rip), %r11 +; X64: callq __llvm_lvi_thunk_r11 +; X64-DAG: movl %[[x]], %edi +; X64-DAG: movq global_fp(%rip), %r11 +; X64: jmp __llvm_lvi_thunk_r11 # TAILCALL + +; X64FAST-LABEL: icall_global_fp: +; X64FAST: movq global_fp(%rip), %r11 +; X64FAST: callq __llvm_lvi_thunk_r11 +; X64FAST: movq global_fp(%rip), %r11 +; X64FAST: jmp __llvm_lvi_thunk_r11 # TAILCALL + + +%struct.Foo = type { void (%struct.Foo*)** } + +; Test an indirect call through a vtable. +define void @vcall(%struct.Foo* %obj) #0 { + %vptr_field = getelementptr %struct.Foo, %struct.Foo* %obj, i32 0, i32 0 + %vptr = load void (%struct.Foo*)**, void (%struct.Foo*)*** %vptr_field + %vslot = getelementptr void(%struct.Foo*)*, void(%struct.Foo*)** %vptr, i32 1 + %fp = load void(%struct.Foo*)*, void(%struct.Foo*)** %vslot + tail call void %fp(%struct.Foo* %obj) + tail call void %fp(%struct.Foo* %obj) + ret void +} + +; X64-LABEL: vcall: +; X64: movq %rdi, %[[obj:[^ ]*]] +; X64: movq (%rdi), %[[vptr:[^ ]*]] +; X64: movq 8(%[[vptr]]), %[[fp:[^ ]*]] +; X64: movq %[[fp]], %r11 +; X64: callq __llvm_lvi_thunk_r11 +; X64-DAG: movq %[[obj]], %rdi +; X64-DAG: movq %[[fp]], %r11 +; X64: jmp __llvm_lvi_thunk_r11 # TAILCALL + +; X64FAST-LABEL: vcall: +; X64FAST: callq __llvm_lvi_thunk_r11 +; X64FAST: jmp __llvm_lvi_thunk_r11 # TAILCALL + + +declare void @direct_callee() + +define void @direct_tail() #0 { + tail call void @direct_callee() + ret void +} + +; X64-LABEL: direct_tail: +; X64: jmp direct_callee # TAILCALL +; X64FAST-LABEL: direct_tail: +; X64FAST: jmp direct_callee # TAILCALL + + +declare void @nonlazybind_callee() #1 + +define void @nonlazybind_caller() #0 { + call void @nonlazybind_callee() + tail call void @nonlazybind_callee() + ret void +} + +; X64-LABEL: nonlazybind_caller: +; X64: movq nonlazybind_callee@GOTPCREL(%rip), %[[REG:.*]] +; X64: movq %[[REG]], %r11 +; X64: callq __llvm_lvi_thunk_r11 +; X64: movq %[[REG]], %r11 +; X64: jmp __llvm_lvi_thunk_r11 # TAILCALL +; X64FAST-LABEL: nonlazybind_caller: +; X64FAST: movq nonlazybind_callee@GOTPCREL(%rip), %r11 +; X64FAST: callq __llvm_lvi_thunk_r11 +; X64FAST: movq nonlazybind_callee@GOTPCREL(%rip), %r11 +; X64FAST: jmp __llvm_lvi_thunk_r11 # TAILCALL + + +; Check that a switch gets lowered using a jump table +define void @switch_jumptable(i32* %ptr, i64* %sink) #0 { +; X64-LABEL: switch_jumptable: +; X64_NOT: jmpq * +entry: + br 
label %header + +header: + %i = load volatile i32, i32* %ptr + switch i32 %i, label %bb0 [ + i32 1, label %bb1 + i32 2, label %bb2 + i32 3, label %bb3 + i32 4, label %bb4 + i32 5, label %bb5 + i32 6, label %bb6 + i32 7, label %bb7 + i32 8, label %bb8 + i32 9, label %bb9 + ] + +bb0: + store volatile i64 0, i64* %sink + br label %header + +bb1: + store volatile i64 1, i64* %sink + br label %header + +bb2: + store volatile i64 2, i64* %sink + br label %header + +bb3: + store volatile i64 3, i64* %sink + br label %header + +bb4: + store volatile i64 4, i64* %sink + br label %header + +bb5: + store volatile i64 5, i64* %sink + br label %header + +bb6: + store volatile i64 6, i64* %sink + br label %header + +bb7: + store volatile i64 7, i64* %sink + br label %header + +bb8: + store volatile i64 8, i64* %sink + br label %header + +bb9: + store volatile i64 9, i64* %sink + br label %header +} + + +@indirectbr_rewrite.targets = constant [10 x i8*] [i8* blockaddress(@indirectbr_rewrite, %bb0), + i8* blockaddress(@indirectbr_rewrite, %bb1), + i8* blockaddress(@indirectbr_rewrite, %bb2), + i8* blockaddress(@indirectbr_rewrite, %bb3), + i8* blockaddress(@indirectbr_rewrite, %bb4), + i8* blockaddress(@indirectbr_rewrite, %bb5), + i8* blockaddress(@indirectbr_rewrite, %bb6), + i8* blockaddress(@indirectbr_rewrite, %bb7), + i8* blockaddress(@indirectbr_rewrite, %bb8), + i8* blockaddress(@indirectbr_rewrite, %bb9)] + +; Check that when thunks are enabled the indirectbr instruction gets +; rewritten to use switch, and that in turn doesn't get lowered as a jump +; table. +define void @indirectbr_rewrite(i64* readonly %p, i64* %sink) #0 { +; X64-LABEL: indirectbr_rewrite: +; X64-NOT: jmpq * +entry: + %i0 = load i64, i64* %p + %target.i0 = getelementptr [10 x i8*], [10 x i8*]* @indirectbr_rewrite.targets, i64 0, i64 %i0 + %target0 = load i8*, i8** %target.i0 + indirectbr i8* %target0, [label %bb1, label %bb3] + +bb0: + store volatile i64 0, i64* %sink + br label %latch + +bb1: + store volatile i64 1, i64* %sink + br label %latch + +bb2: + store volatile i64 2, i64* %sink + br label %latch + +bb3: + store volatile i64 3, i64* %sink + br label %latch + +bb4: + store volatile i64 4, i64* %sink + br label %latch + +bb5: + store volatile i64 5, i64* %sink + br label %latch + +bb6: + store volatile i64 6, i64* %sink + br label %latch + +bb7: + store volatile i64 7, i64* %sink + br label %latch + +bb8: + store volatile i64 8, i64* %sink + br label %latch + +bb9: + store volatile i64 9, i64* %sink + br label %latch + +latch: + %i.next = load i64, i64* %p + %target.i.next = getelementptr [10 x i8*], [10 x i8*]* @indirectbr_rewrite.targets, i64 0, i64 %i.next + %target.next = load i8*, i8** %target.i.next + ; Potentially hit a full 10 successors here so that even if we rewrite as + ; a switch it will try to be lowered with a jump table. + indirectbr i8* %target.next, [label %bb0, + label %bb1, + label %bb2, + label %bb3, + label %bb4, + label %bb5, + label %bb6, + label %bb7, + label %bb8, + label %bb9] +} + +; Lastly check that the necessary thunks were emitted. +; +; X64-LABEL: .section .text.__llvm_lvi_thunk_r11,{{.*}},__llvm_lvi_thunk_r11,comdat +; X64-NEXT: .hidden __llvm_lvi_thunk_r11 +; X64-NEXT: .weak __llvm_lvi_thunk_r11 +; X64: __llvm_lvi_thunk_r11: +; X64-NEXT: # {{.*}} # %entry +; X64-NEXT: lfence +; X64-NEXT: jmpq *%r11 + +attributes #1 = { nonlazybind } |