-rw-r--r--   llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp           | 271
-rw-r--r--   llvm/test/CodeGen/X86/speculative-load-hardening-indirect.ll  | 439
2 files changed, 686 insertions, 24 deletions
diff --git a/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp b/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
index 81bd9014a88..923d82f051f 100644
--- a/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
+++ b/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
@@ -179,6 +179,9 @@ private:
void unfoldCallAndJumpLoads(MachineFunction &MF);
+ SmallVector<MachineInstr *, 16>
+ tracePredStateThroughIndirectBranches(MachineFunction &MF);
+
void tracePredStateThroughBlocksAndHarden(MachineFunction &MF);
unsigned saveEFLAGS(MachineBasicBlock &MBB,
@@ -522,11 +525,16 @@ bool X86SpeculativeLoadHardeningPass::runOnMachineFunction(
}
}
- // If we are going to harden calls and jumps we need to unfold their memory
- // operands.
- if (HardenIndirectCallsAndJumps)
+ if (HardenIndirectCallsAndJumps) {
+ // If we are going to harden calls and jumps we need to unfold their memory
+ // operands.
unfoldCallAndJumpLoads(MF);
+ // Then we trace predicate state through the indirect branches.
+ auto IndirectBrCMovs = tracePredStateThroughIndirectBranches(MF);
+ CMovs.append(IndirectBrCMovs.begin(), IndirectBrCMovs.end());
+ }
+
// Now that we have the predicate state available at the start of each block
// in the CFG, trace it through each block, hardening vulnerable instructions
// as we go.
@@ -925,6 +933,263 @@ void X86SpeculativeLoadHardeningPass::unfoldCallAndJumpLoads(
}
}
+/// Trace the predicate state through indirect branches, instrumenting them to
+/// poison the state if a target is reached that does not match the expected
+/// target.
+///
+/// This is designed to mitigate Spectre variant 1 attacks where an indirect
+/// branch is trained to predict a particular target and then mispredicts that
+/// target in a way that can leak data. Despite using an indirect branch, this
+/// is really a variant 1 style attack: it does not steer execution to an
+/// arbitrary or attacker controlled address, and it does not require any
+/// special code executing next to the victim. This attack can also be mitigated
+/// through retpolines, but those require either replacing indirect branches
+/// with conditional direct branches or lowering them through a device that
+/// blocks speculation. This mitigation can replace these retpoline-style
+/// mitigations for jump tables and other indirect branches within a function
+/// when variant 2 isn't a risk, while still allowing limited speculation. Indirect
+/// calls, however, cannot be mitigated through this technique without changing
+/// the ABI in a fundamental way.
+SmallVector<MachineInstr *, 16>
+X86SpeculativeLoadHardeningPass::tracePredStateThroughIndirectBranches(
+ MachineFunction &MF) {
+ // We use the SSAUpdater to insert PHI nodes for the target addresses of
+ // indirect branches. We don't actually need the full power of the SSA updater
+ // in this particular case as we always have immediately available values, but
+ // this avoids us having to re-implement the PHI construction logic.
+ MachineSSAUpdater TargetAddrSSA(MF);
+ TargetAddrSSA.Initialize(MRI->createVirtualRegister(&X86::GR64RegClass));
+
+ // Track which blocks were terminated with an indirect branch.
+ SmallPtrSet<MachineBasicBlock *, 4> IndirectTerminatedMBBs;
+
+ // We need to know what blocks end up reached via indirect branches. We
+ // expect this to be a subset of those whose address is taken and so track it
+ // directly via the CFG.
+ SmallPtrSet<MachineBasicBlock *, 4> IndirectTargetMBBs;
+
+ // Walk all the blocks which end in an indirect branch and make the
+ // target address available.
+ for (MachineBasicBlock &MBB : MF) {
+ // Find the last terminator.
+ auto MII = MBB.instr_rbegin();
+ while (MII != MBB.instr_rend() && MII->isDebugInstr())
+ ++MII;
+ if (MII == MBB.instr_rend())
+ continue;
+ MachineInstr &TI = *MII;
+ if (!TI.isTerminator() || !TI.isBranch())
+ // No terminator or non-branch terminator.
+ continue;
+
+ unsigned TargetReg;
+
+ switch (TI.getOpcode()) {
+ default:
+ // Direct branch or conditional branch (leading to fallthrough).
+ continue;
+
+ case X86::FARJMP16m:
+ case X86::FARJMP32m:
+ case X86::FARJMP64:
+ // We cannot mitigate far jumps or calls, but we also don't expect them
+ // to be vulnerable to Spectre v1.2 or v2 (self trained) style attacks.
+ continue;
+
+ case X86::JMP16m:
+ case X86::JMP16m_NT:
+ case X86::JMP32m:
+ case X86::JMP32m_NT:
+ case X86::JMP64m:
+ case X86::JMP64m_NT:
+ // Mostly as documentation.
+ report_fatal_error("Memory operand jumps should have been unfolded!");
+
+ case X86::JMP16r:
+ report_fatal_error(
+ "Support for 16-bit indirect branches is not implemented.");
+ case X86::JMP32r:
+ report_fatal_error(
+ "Support for 32-bit indirect branches is not implemented.");
+
+ case X86::JMP64r:
+ TargetReg = TI.getOperand(0).getReg();
+ }
+
+ // We have definitely found an indirect branch. Verify that there are no
+ // preceding conditional branches as we don't yet support that.
+ if (llvm::any_of(MBB.terminators(), [&](MachineInstr &OtherTI) {
+ return !OtherTI.isDebugInstr() && &OtherTI != &TI;
+ })) {
+ LLVM_DEBUG({
+ dbgs() << "ERROR: Found other terminators in a block with an indirect "
+ "branch! This is not yet supported! Terminator sequence:\n";
+ for (MachineInstr &MI : MBB.terminators()) {
+ MI.dump();
+ dbgs() << '\n';
+ }
+ });
+ report_fatal_error("Unimplemented terminator sequence!");
+ }
+
+ // Make the target register an available value for this block.
+ TargetAddrSSA.AddAvailableValue(&MBB, TargetReg);
+ IndirectTerminatedMBBs.insert(&MBB);
+
+ // Add all the successors to our target candidates.
+ for (MachineBasicBlock *Succ : MBB.successors())
+ IndirectTargetMBBs.insert(Succ);
+ }
+
+ // Keep track of the cmov instructions we insert so we can return them.
+ SmallVector<MachineInstr *, 16> CMovs;
+
+ // If we didn't find any indirect branches with targets, nothing to do here.
+ if (IndirectTargetMBBs.empty())
+ return CMovs;
+
+ // We found indirect branches and targets that need to be instrumented to
+ // harden loads within them. Walk the blocks of the function (to get a stable
+ // ordering) and instrument each target of an indirect branch.
+ for (MachineBasicBlock &MBB : MF) {
+ // Skip the blocks that aren't candidate targets.
+ if (!IndirectTargetMBBs.count(&MBB))
+ continue;
+
+ // We don't expect EH pads to ever be reached via an indirect branch. If
+ // this is desired for some reason, we could simply skip them here rather
+ // than asserting.
+ assert(!MBB.isEHPad() &&
+ "Unexpected EH pad as target of an indirect branch!");
+
+ // We should never end up threading EFLAGS into a block to harden
+ // conditional jumps as there would be an additional successor via the
+ // indirect branch. As a consequence, all such edges would be split before
+ // reaching here, and the inserted block will handle the EFLAGS-based
+ // hardening.
+ assert(!MBB.isLiveIn(X86::EFLAGS) &&
+ "Cannot check within a block that already has live-in EFLAGS!");
+
+ // We can't handle having non-indirect edges into this block unless this is
+ // the only successor and we can synthesize the necessary target address.
+ for (MachineBasicBlock *Pred : MBB.predecessors()) {
+ // If we've already handled this by extracting the target directly,
+ // nothing to do.
+ if (IndirectTerminatedMBBs.count(Pred))
+ continue;
+
+ // Otherwise, we have to be the only successor. We generally expect this
+ // to be true as conditional branches should have had a critical edge
+ // split already. We don't however need to worry about EH pad successors
+ // as they'll happily ignore the target and their hardening strategy is
+ // resilient to all ways in which they could be reached speculatively.
+ if (!llvm::all_of(Pred->successors(), [&](MachineBasicBlock *Succ) {
+ return Succ->isEHPad() || Succ == &MBB;
+ })) {
+ LLVM_DEBUG({
+ dbgs() << "ERROR: Found conditional entry to target of indirect "
+ "branch!\n";
+ Pred->dump();
+ MBB.dump();
+ });
+ report_fatal_error("Cannot harden a conditional entry to a target of "
+ "an indirect branch!");
+ }
+
+ // Now we need to compute the address of this block and install it as a
+ // synthetic target in the predecessor. We do this at the bottom of the
+ // predecessor.
+ auto InsertPt = Pred->getFirstTerminator();
+ unsigned TargetReg = MRI->createVirtualRegister(&X86::GR64RegClass);
+ if (MF.getTarget().getCodeModel() == CodeModel::Small &&
+ !Subtarget->isPositionIndependent()) {
+ // Directly materialize it into an immediate.
+ auto AddrI = BuildMI(*Pred, InsertPt, DebugLoc(),
+ TII->get(X86::MOV64ri32), TargetReg)
+ .addMBB(&MBB);
+ ++NumInstsInserted;
+ (void)AddrI;
+ LLVM_DEBUG(dbgs() << " Inserting mov: "; AddrI->dump();
+ dbgs() << "\n");
+ } else {
+ auto AddrI = BuildMI(*Pred, InsertPt, DebugLoc(), TII->get(X86::LEA64r),
+ TargetReg)
+ .addReg(/*Base*/ X86::RIP)
+ .addImm(/*Scale*/ 1)
+ .addReg(/*Index*/ 0)
+ .addMBB(&MBB)
+ .addReg(/*Segment*/ 0);
+ ++NumInstsInserted;
+ (void)AddrI;
+ LLVM_DEBUG(dbgs() << " Inserting lea: "; AddrI->dump();
+ dbgs() << "\n");
+ }
+ // And make this available.
+ TargetAddrSSA.AddAvailableValue(Pred, TargetReg);
+ }
+
+ // Materialize the needed SSA value of the target. Note that we need the
+ // middle of the block as this block might at the bottom have an indirect
+ // branch back to itself. We can do this here because at this point, every
+ // predecessor of this block has an available value. This is basically just
+ // automating the construction of a PHI node for this target.
+ unsigned TargetReg = TargetAddrSSA.GetValueInMiddleOfBlock(&MBB);
+
+ // Insert a comparison of the incoming target register with this block's
+ // address.
+ auto InsertPt = MBB.SkipPHIsLabelsAndDebug(MBB.begin());
+ if (MF.getTarget().getCodeModel() == CodeModel::Small &&
+ !Subtarget->isPositionIndependent()) {
+ // Check directly against a relocated immediate when we can.
+ auto CheckI = BuildMI(MBB, InsertPt, DebugLoc(), TII->get(X86::CMP64ri32))
+ .addReg(TargetReg, RegState::Kill)
+ .addMBB(&MBB);
+ ++NumInstsInserted;
+ (void)CheckI;
+ LLVM_DEBUG(dbgs() << " Inserting cmp: "; CheckI->dump(); dbgs() << "\n");
+ } else {
+ // Otherwise compute the address into a register first.
+ unsigned AddrReg = MRI->createVirtualRegister(&X86::GR64RegClass);
+ auto AddrI =
+ BuildMI(MBB, InsertPt, DebugLoc(), TII->get(X86::LEA64r), AddrReg)
+ .addReg(/*Base*/ X86::RIP)
+ .addImm(/*Scale*/ 1)
+ .addReg(/*Index*/ 0)
+ .addMBB(&MBB)
+ .addReg(/*Segment*/ 0);
+ ++NumInstsInserted;
+ (void)AddrI;
+ LLVM_DEBUG(dbgs() << " Inserting lea: "; AddrI->dump(); dbgs() << "\n");
+ auto CheckI = BuildMI(MBB, InsertPt, DebugLoc(), TII->get(X86::CMP64rr))
+ .addReg(TargetReg, RegState::Kill)
+ .addReg(AddrReg, RegState::Kill);
+ ++NumInstsInserted;
+ (void)CheckI;
+ LLVM_DEBUG(dbgs() << " Inserting cmp: "; CheckI->dump(); dbgs() << "\n");
+ }
+
+ // Now cmov over the predicate if the comparison wasn't equal.
+ int PredStateSizeInBytes = TRI->getRegSizeInBits(*PS->RC) / 8;
+ auto CMovOp = X86::getCMovFromCond(X86::COND_NE, PredStateSizeInBytes);
+ unsigned UpdatedStateReg = MRI->createVirtualRegister(PS->RC);
+ auto CMovI =
+ BuildMI(MBB, InsertPt, DebugLoc(), TII->get(CMovOp), UpdatedStateReg)
+ .addReg(PS->InitialReg)
+ .addReg(PS->PoisonReg);
+ CMovI->findRegisterUseOperand(X86::EFLAGS)->setIsKill(true);
+ ++NumInstsInserted;
+ LLVM_DEBUG(dbgs() << " Inserting cmov: "; CMovI->dump(); dbgs() << "\n");
+ CMovs.push_back(&*CMovI);
+
+ // And put the new value into the available values for SSA form of our
+ // predicate state.
+ PS->SSA.AddAvailableValue(&MBB, UpdatedStateReg);
+ }
+
+ // Return all the newly inserted cmov instructions of the predicate state.
+ return CMovs;
+}
+
/// Returns true if the instruction has no behavior (specified or otherwise)
/// that is based on the value of any of its register operands
///
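Note on the PHI construction used by the new tracePredStateThroughIndirectBranches above: each block ending in an indirect branch (and each predecessor in which a synthetic block address is materialized) publishes its target-address register to a MachineSSAUpdater, and querying the value "in the middle" of a target block then builds the PHI joining those values. The following is a minimal sketch of that pattern only, not code from the patch; the function name, the Pred1/Pred2/Target blocks, and the register arguments are illustrative assumptions.

    #include "llvm/CodeGen/MachineFunction.h"
    #include "llvm/CodeGen/MachineRegisterInfo.h"
    #include "llvm/CodeGen/MachineSSAUpdater.h"
    #include "llvm/CodeGen/TargetRegisterInfo.h"

    using namespace llvm;

    // Sketch only: joins per-predecessor target-address registers into one
    // SSA value in the successor, the way the pass does for indirect-branch
    // targets. All parameter names here are hypothetical.
    unsigned buildTargetAddressPHI(MachineFunction &MF,
                                   const TargetRegisterClass *AddrRC,
                                   MachineBasicBlock *Pred1, unsigned TargetReg1,
                                   MachineBasicBlock *Pred2, unsigned TargetReg2,
                                   MachineBasicBlock *Target) {
      MachineSSAUpdater AddrSSA(MF);
      // Seed the updater with a virtual register of the address class
      // (GR64 in the patch) so it knows what kind of value it is joining.
      AddrSSA.Initialize(MF.getRegInfo().createVirtualRegister(AddrRC));

      // Each predecessor publishes the value it knows: the register its
      // indirect branch jumped through, or a freshly materialized block
      // address.
      AddrSSA.AddAvailableValue(Pred1, TargetReg1);
      AddrSSA.AddAvailableValue(Pred2, TargetReg2);

      // Asking for the value in the middle of the successor inserts the PHI
      // node joining the per-predecessor values; the pass then compares this
      // register against the block's own address and cmov-poisons the
      // predicate state on a mismatch.
      return AddrSSA.GetValueInMiddleOfBlock(Target);
    }

The patch uses this pattern twice: once for the branch-target addresses (TargetAddrSSA) and once for the predicate state itself (PS->SSA), which is how the newly inserted cmov results flow into the rest of the hardening.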
diff --git a/llvm/test/CodeGen/X86/speculative-load-hardening-indirect.ll b/llvm/test/CodeGen/X86/speculative-load-hardening-indirect.ll
index be8db624572..a94dc9219e3 100644
--- a/llvm/test/CodeGen/X86/speculative-load-hardening-indirect.ll
+++ b/llvm/test/CodeGen/X86/speculative-load-hardening-indirect.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -x86-speculative-load-hardening -data-sections | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -x86-speculative-load-hardening -relocation-model pic -data-sections | FileCheck %s --check-prefix=X64-PIC
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -x86-speculative-load-hardening -data-sections -mattr=+retpoline | FileCheck %s --check-prefix=X64-RETPOLINE
;
; FIXME: Add support for 32-bit.
@@ -32,6 +33,24 @@ define i32 @test_indirect_call(i32 ()** %ptr) nounwind {
; X64-NEXT: popq %rcx
; X64-NEXT: retq
;
+; X64-PIC-LABEL: test_indirect_call:
+; X64-PIC: # %bb.0: # %entry
+; X64-PIC-NEXT: pushq %rax
+; X64-PIC-NEXT: movq %rsp, %rax
+; X64-PIC-NEXT: movq $-1, %rcx
+; X64-PIC-NEXT: sarq $63, %rax
+; X64-PIC-NEXT: movq (%rdi), %rcx
+; X64-PIC-NEXT: orq %rax, %rcx
+; X64-PIC-NEXT: shlq $47, %rax
+; X64-PIC-NEXT: orq %rax, %rsp
+; X64-PIC-NEXT: callq *%rcx
+; X64-PIC-NEXT: movq %rsp, %rcx
+; X64-PIC-NEXT: sarq $63, %rcx
+; X64-PIC-NEXT: shlq $47, %rcx
+; X64-PIC-NEXT: orq %rcx, %rsp
+; X64-PIC-NEXT: popq %rcx
+; X64-PIC-NEXT: retq
+;
; X64-RETPOLINE-LABEL: test_indirect_call:
; X64-RETPOLINE: # %bb.0: # %entry
; X64-RETPOLINE-NEXT: pushq %rax
@@ -67,6 +86,17 @@ define i32 @test_indirect_tail_call(i32 ()** %ptr) nounwind {
; X64-NEXT: orq %rax, %rsp
; X64-NEXT: jmpq *%rcx # TAILCALL
;
+; X64-PIC-LABEL: test_indirect_tail_call:
+; X64-PIC: # %bb.0: # %entry
+; X64-PIC-NEXT: movq %rsp, %rax
+; X64-PIC-NEXT: movq $-1, %rcx
+; X64-PIC-NEXT: sarq $63, %rax
+; X64-PIC-NEXT: movq (%rdi), %rcx
+; X64-PIC-NEXT: orq %rax, %rcx
+; X64-PIC-NEXT: shlq $47, %rax
+; X64-PIC-NEXT: orq %rax, %rsp
+; X64-PIC-NEXT: jmpq *%rcx # TAILCALL
+;
; X64-RETPOLINE-LABEL: test_indirect_tail_call:
; X64-RETPOLINE: # %bb.0: # %entry
; X64-RETPOLINE-NEXT: movq %rsp, %rax
@@ -90,7 +120,7 @@ define i32 @test_indirect_call_global() nounwind {
; X64-NEXT: movq %rsp, %rax
; X64-NEXT: movq $-1, %rcx
; X64-NEXT: sarq $63, %rax
-; X64-NEXT: movq {{.*}}(%rip), %rcx
+; X64-NEXT: movq global_fnptr(%rip), %rcx
; X64-NEXT: orq %rax, %rcx
; X64-NEXT: shlq $47, %rax
; X64-NEXT: orq %rax, %rsp
@@ -102,13 +132,32 @@ define i32 @test_indirect_call_global() nounwind {
; X64-NEXT: popq %rcx
; X64-NEXT: retq
;
+; X64-PIC-LABEL: test_indirect_call_global:
+; X64-PIC: # %bb.0: # %entry
+; X64-PIC-NEXT: pushq %rax
+; X64-PIC-NEXT: movq %rsp, %rax
+; X64-PIC-NEXT: movq $-1, %rcx
+; X64-PIC-NEXT: sarq $63, %rax
+; X64-PIC-NEXT: movq global_fnptr@GOTPCREL(%rip), %rcx
+; X64-PIC-NEXT: movq (%rcx), %rcx
+; X64-PIC-NEXT: orq %rax, %rcx
+; X64-PIC-NEXT: shlq $47, %rax
+; X64-PIC-NEXT: orq %rax, %rsp
+; X64-PIC-NEXT: callq *%rcx
+; X64-PIC-NEXT: movq %rsp, %rcx
+; X64-PIC-NEXT: sarq $63, %rcx
+; X64-PIC-NEXT: shlq $47, %rcx
+; X64-PIC-NEXT: orq %rcx, %rsp
+; X64-PIC-NEXT: popq %rcx
+; X64-PIC-NEXT: retq
+;
; X64-RETPOLINE-LABEL: test_indirect_call_global:
; X64-RETPOLINE: # %bb.0: # %entry
; X64-RETPOLINE-NEXT: pushq %rax
; X64-RETPOLINE-NEXT: movq %rsp, %rax
; X64-RETPOLINE-NEXT: movq $-1, %rcx
; X64-RETPOLINE-NEXT: sarq $63, %rax
-; X64-RETPOLINE-NEXT: movq {{.*}}(%rip), %r11
+; X64-RETPOLINE-NEXT: movq global_fnptr(%rip), %r11
; X64-RETPOLINE-NEXT: shlq $47, %rax
; X64-RETPOLINE-NEXT: orq %rax, %rsp
; X64-RETPOLINE-NEXT: callq __llvm_retpoline_r11
@@ -130,18 +179,30 @@ define i32 @test_indirect_tail_call_global() nounwind {
; X64-NEXT: movq %rsp, %rax
; X64-NEXT: movq $-1, %rcx
; X64-NEXT: sarq $63, %rax
-; X64-NEXT: movq {{.*}}(%rip), %rcx
+; X64-NEXT: movq global_fnptr(%rip), %rcx
; X64-NEXT: orq %rax, %rcx
; X64-NEXT: shlq $47, %rax
; X64-NEXT: orq %rax, %rsp
; X64-NEXT: jmpq *%rcx # TAILCALL
;
+; X64-PIC-LABEL: test_indirect_tail_call_global:
+; X64-PIC: # %bb.0: # %entry
+; X64-PIC-NEXT: movq %rsp, %rax
+; X64-PIC-NEXT: movq $-1, %rcx
+; X64-PIC-NEXT: sarq $63, %rax
+; X64-PIC-NEXT: movq global_fnptr@GOTPCREL(%rip), %rcx
+; X64-PIC-NEXT: movq (%rcx), %rcx
+; X64-PIC-NEXT: orq %rax, %rcx
+; X64-PIC-NEXT: shlq $47, %rax
+; X64-PIC-NEXT: orq %rax, %rsp
+; X64-PIC-NEXT: jmpq *%rcx # TAILCALL
+;
; X64-RETPOLINE-LABEL: test_indirect_tail_call_global:
; X64-RETPOLINE: # %bb.0: # %entry
; X64-RETPOLINE-NEXT: movq %rsp, %rax
; X64-RETPOLINE-NEXT: movq $-1, %rcx
; X64-RETPOLINE-NEXT: sarq $63, %rax
-; X64-RETPOLINE-NEXT: movq {{.*}}(%rip), %r11
+; X64-RETPOLINE-NEXT: movq global_fnptr(%rip), %r11
; X64-RETPOLINE-NEXT: shlq $47, %rax
; X64-RETPOLINE-NEXT: orq %rax, %rsp
; X64-RETPOLINE-NEXT: jmp __llvm_retpoline_r11 # TAILCALL
@@ -157,25 +218,69 @@ define i32 @test_indirectbr(i8** %ptr) nounwind {
; X64-NEXT: movq %rsp, %rcx
; X64-NEXT: movq $-1, %rax
; X64-NEXT: sarq $63, %rcx
-; X64-NEXT: movq (%rdi), %rax
-; X64-NEXT: orq %rcx, %rax
-; X64-NEXT: jmpq *%rax
+; X64-NEXT: movq (%rdi), %rdx
+; X64-NEXT: orq %rcx, %rdx
+; X64-NEXT: jmpq *%rdx
; X64-NEXT: .LBB4_1: # %bb0
+; X64-NEXT: cmpq $.LBB4_1, %rdx
+; X64-NEXT: cmovneq %rax, %rcx
; X64-NEXT: movl $2, %eax
; X64-NEXT: jmp .LBB4_2
; X64-NEXT: .LBB4_4: # %bb2
+; X64-NEXT: cmpq $.LBB4_4, %rdx
+; X64-NEXT: cmovneq %rax, %rcx
; X64-NEXT: movl $13, %eax
; X64-NEXT: jmp .LBB4_2
; X64-NEXT: .LBB4_5: # %bb3
+; X64-NEXT: cmpq $.LBB4_5, %rdx
+; X64-NEXT: cmovneq %rax, %rcx
; X64-NEXT: movl $42, %eax
; X64-NEXT: jmp .LBB4_2
; X64-NEXT: .LBB4_3: # %bb1
+; X64-NEXT: cmpq $.LBB4_3, %rdx
+; X64-NEXT: cmovneq %rax, %rcx
; X64-NEXT: movl $7, %eax
; X64-NEXT: .LBB4_2: # %bb0
; X64-NEXT: shlq $47, %rcx
; X64-NEXT: orq %rcx, %rsp
; X64-NEXT: retq
;
+; X64-PIC-LABEL: test_indirectbr:
+; X64-PIC: # %bb.0: # %entry
+; X64-PIC-NEXT: movq %rsp, %rcx
+; X64-PIC-NEXT: movq $-1, %rax
+; X64-PIC-NEXT: sarq $63, %rcx
+; X64-PIC-NEXT: movq (%rdi), %rdx
+; X64-PIC-NEXT: orq %rcx, %rdx
+; X64-PIC-NEXT: jmpq *%rdx
+; X64-PIC-NEXT: .LBB4_1: # %bb0
+; X64-PIC-NEXT: leaq .LBB4_1(%rip), %rsi
+; X64-PIC-NEXT: cmpq %rsi, %rdx
+; X64-PIC-NEXT: cmovneq %rax, %rcx
+; X64-PIC-NEXT: movl $2, %eax
+; X64-PIC-NEXT: jmp .LBB4_2
+; X64-PIC-NEXT: .LBB4_4: # %bb2
+; X64-PIC-NEXT: leaq .LBB4_4(%rip), %rsi
+; X64-PIC-NEXT: cmpq %rsi, %rdx
+; X64-PIC-NEXT: cmovneq %rax, %rcx
+; X64-PIC-NEXT: movl $13, %eax
+; X64-PIC-NEXT: jmp .LBB4_2
+; X64-PIC-NEXT: .LBB4_5: # %bb3
+; X64-PIC-NEXT: leaq .LBB4_5(%rip), %rsi
+; X64-PIC-NEXT: cmpq %rsi, %rdx
+; X64-PIC-NEXT: cmovneq %rax, %rcx
+; X64-PIC-NEXT: movl $42, %eax
+; X64-PIC-NEXT: jmp .LBB4_2
+; X64-PIC-NEXT: .LBB4_3: # %bb1
+; X64-PIC-NEXT: leaq .LBB4_3(%rip), %rsi
+; X64-PIC-NEXT: cmpq %rsi, %rdx
+; X64-PIC-NEXT: cmovneq %rax, %rcx
+; X64-PIC-NEXT: movl $7, %eax
+; X64-PIC-NEXT: .LBB4_2: # %bb0
+; X64-PIC-NEXT: shlq $47, %rcx
+; X64-PIC-NEXT: orq %rcx, %rsp
+; X64-PIC-NEXT: retq
+;
; X64-RETPOLINE-LABEL: test_indirectbr:
; X64-RETPOLINE: # %bb.0: # %entry
entry:
@@ -201,30 +306,80 @@ define i32 @test_indirectbr_global(i32 %idx) nounwind {
; X64-NEXT: movq %rsp, %rcx
; X64-NEXT: movq $-1, %rax
; X64-NEXT: sarq $63, %rcx
-; X64-NEXT: movslq %edi, %rax
-; X64-NEXT: movq global_blockaddrs(,%rax,8), %rax
-; X64-NEXT: orq %rcx, %rax
-; X64-NEXT: jmpq *%rax
+; X64-NEXT: movslq %edi, %rdx
+; X64-NEXT: movq global_blockaddrs(,%rdx,8), %rdx
+; X64-NEXT: orq %rcx, %rdx
+; X64-NEXT: jmpq *%rdx
; X64-NEXT: .Ltmp0: # Block address taken
; X64-NEXT: .LBB5_1: # %bb0
+; X64-NEXT: cmpq $.LBB5_1, %rdx
+; X64-NEXT: cmovneq %rax, %rcx
; X64-NEXT: movl $2, %eax
; X64-NEXT: jmp .LBB5_2
; X64-NEXT: .Ltmp1: # Block address taken
; X64-NEXT: .LBB5_4: # %bb2
+; X64-NEXT: cmpq $.LBB5_4, %rdx
+; X64-NEXT: cmovneq %rax, %rcx
; X64-NEXT: movl $13, %eax
; X64-NEXT: jmp .LBB5_2
; X64-NEXT: .Ltmp2: # Block address taken
; X64-NEXT: .LBB5_5: # %bb3
+; X64-NEXT: cmpq $.LBB5_5, %rdx
+; X64-NEXT: cmovneq %rax, %rcx
; X64-NEXT: movl $42, %eax
; X64-NEXT: jmp .LBB5_2
; X64-NEXT: .Ltmp3: # Block address taken
; X64-NEXT: .LBB5_3: # %bb1
+; X64-NEXT: cmpq $.LBB5_3, %rdx
+; X64-NEXT: cmovneq %rax, %rcx
; X64-NEXT: movl $7, %eax
; X64-NEXT: .LBB5_2: # %bb0
; X64-NEXT: shlq $47, %rcx
; X64-NEXT: orq %rcx, %rsp
; X64-NEXT: retq
;
+; X64-PIC-LABEL: test_indirectbr_global:
+; X64-PIC: # %bb.0: # %entry
+; X64-PIC-NEXT: movq %rsp, %rcx
+; X64-PIC-NEXT: movq $-1, %rax
+; X64-PIC-NEXT: sarq $63, %rcx
+; X64-PIC-NEXT: movslq %edi, %rdx
+; X64-PIC-NEXT: movq global_blockaddrs@GOTPCREL(%rip), %rsi
+; X64-PIC-NEXT: movq (%rsi,%rdx,8), %rdx
+; X64-PIC-NEXT: orq %rcx, %rdx
+; X64-PIC-NEXT: jmpq *%rdx
+; X64-PIC-NEXT: .Ltmp0: # Block address taken
+; X64-PIC-NEXT: .LBB5_1: # %bb0
+; X64-PIC-NEXT: leaq .LBB5_1(%rip), %rsi
+; X64-PIC-NEXT: cmpq %rsi, %rdx
+; X64-PIC-NEXT: cmovneq %rax, %rcx
+; X64-PIC-NEXT: movl $2, %eax
+; X64-PIC-NEXT: jmp .LBB5_2
+; X64-PIC-NEXT: .Ltmp1: # Block address taken
+; X64-PIC-NEXT: .LBB5_4: # %bb2
+; X64-PIC-NEXT: leaq .LBB5_4(%rip), %rsi
+; X64-PIC-NEXT: cmpq %rsi, %rdx
+; X64-PIC-NEXT: cmovneq %rax, %rcx
+; X64-PIC-NEXT: movl $13, %eax
+; X64-PIC-NEXT: jmp .LBB5_2
+; X64-PIC-NEXT: .Ltmp2: # Block address taken
+; X64-PIC-NEXT: .LBB5_5: # %bb3
+; X64-PIC-NEXT: leaq .LBB5_5(%rip), %rsi
+; X64-PIC-NEXT: cmpq %rsi, %rdx
+; X64-PIC-NEXT: cmovneq %rax, %rcx
+; X64-PIC-NEXT: movl $42, %eax
+; X64-PIC-NEXT: jmp .LBB5_2
+; X64-PIC-NEXT: .Ltmp3: # Block address taken
+; X64-PIC-NEXT: .LBB5_3: # %bb1
+; X64-PIC-NEXT: leaq .LBB5_3(%rip), %rsi
+; X64-PIC-NEXT: cmpq %rsi, %rdx
+; X64-PIC-NEXT: cmovneq %rax, %rcx
+; X64-PIC-NEXT: movl $7, %eax
+; X64-PIC-NEXT: .LBB5_2: # %bb0
+; X64-PIC-NEXT: shlq $47, %rcx
+; X64-PIC-NEXT: orq %rcx, %rsp
+; X64-PIC-NEXT: retq
+;
; X64-RETPOLINE-LABEL: test_indirectbr_global:
; X64-RETPOLINE: # %bb.0: # %entry
; X64-RETPOLINE-NEXT: movq %rsp, %rcx
@@ -296,30 +451,85 @@ define i32 @test_switch_jumptable(i32 %idx) nounwind {
; X64-NEXT: ja .LBB6_2
; X64-NEXT: # %bb.1: # %entry
; X64-NEXT: cmovaq %rax, %rcx
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: movq .LJTI6_0(,%rax,8), %rax
-; X64-NEXT: orq %rcx, %rax
-; X64-NEXT: jmpq *%rax
-; X64-NEXT: .LBB6_3: # %bb1
+; X64-NEXT: movl %edi, %edx
+; X64-NEXT: movq .LJTI6_0(,%rdx,8), %rdx
+; X64-NEXT: orq %rcx, %rdx
+; X64-NEXT: jmpq *%rdx
+; X64-NEXT: .LBB6_4: # %bb1
+; X64-NEXT: cmpq $.LBB6_4, %rdx
+; X64-NEXT: cmovneq %rax, %rcx
; X64-NEXT: movl $7, %eax
-; X64-NEXT: jmp .LBB6_4
+; X64-NEXT: jmp .LBB6_3
; X64-NEXT: .LBB6_2: # %bb0
; X64-NEXT: cmovbeq %rax, %rcx
; X64-NEXT: movl $2, %eax
-; X64-NEXT: jmp .LBB6_4
+; X64-NEXT: jmp .LBB6_3
; X64-NEXT: .LBB6_5: # %bb2
+; X64-NEXT: cmpq $.LBB6_5, %rdx
+; X64-NEXT: cmovneq %rax, %rcx
; X64-NEXT: movl $13, %eax
-; X64-NEXT: jmp .LBB6_4
+; X64-NEXT: jmp .LBB6_3
; X64-NEXT: .LBB6_6: # %bb3
+; X64-NEXT: cmpq $.LBB6_6, %rdx
+; X64-NEXT: cmovneq %rax, %rcx
; X64-NEXT: movl $42, %eax
-; X64-NEXT: jmp .LBB6_4
+; X64-NEXT: jmp .LBB6_3
; X64-NEXT: .LBB6_7: # %bb5
+; X64-NEXT: cmpq $.LBB6_7, %rdx
+; X64-NEXT: cmovneq %rax, %rcx
; X64-NEXT: movl $11, %eax
-; X64-NEXT: .LBB6_4: # %bb1
+; X64-NEXT: .LBB6_3: # %bb0
; X64-NEXT: shlq $47, %rcx
; X64-NEXT: orq %rcx, %rsp
; X64-NEXT: retq
;
+; X64-PIC-LABEL: test_switch_jumptable:
+; X64-PIC: # %bb.0: # %entry
+; X64-PIC-NEXT: movq %rsp, %rcx
+; X64-PIC-NEXT: movq $-1, %rax
+; X64-PIC-NEXT: sarq $63, %rcx
+; X64-PIC-NEXT: cmpl $3, %edi
+; X64-PIC-NEXT: ja .LBB6_2
+; X64-PIC-NEXT: # %bb.1: # %entry
+; X64-PIC-NEXT: cmovaq %rax, %rcx
+; X64-PIC-NEXT: movl %edi, %edx
+; X64-PIC-NEXT: leaq .LJTI6_0(%rip), %rsi
+; X64-PIC-NEXT: movslq (%rsi,%rdx,4), %rdx
+; X64-PIC-NEXT: addq %rsi, %rdx
+; X64-PIC-NEXT: orq %rcx, %rdx
+; X64-PIC-NEXT: jmpq *%rdx
+; X64-PIC-NEXT: .LBB6_4: # %bb1
+; X64-PIC-NEXT: leaq .LBB6_4(%rip), %rsi
+; X64-PIC-NEXT: cmpq %rsi, %rdx
+; X64-PIC-NEXT: cmovneq %rax, %rcx
+; X64-PIC-NEXT: movl $7, %eax
+; X64-PIC-NEXT: jmp .LBB6_3
+; X64-PIC-NEXT: .LBB6_2: # %bb0
+; X64-PIC-NEXT: cmovbeq %rax, %rcx
+; X64-PIC-NEXT: movl $2, %eax
+; X64-PIC-NEXT: jmp .LBB6_3
+; X64-PIC-NEXT: .LBB6_5: # %bb2
+; X64-PIC-NEXT: leaq .LBB6_5(%rip), %rsi
+; X64-PIC-NEXT: cmpq %rsi, %rdx
+; X64-PIC-NEXT: cmovneq %rax, %rcx
+; X64-PIC-NEXT: movl $13, %eax
+; X64-PIC-NEXT: jmp .LBB6_3
+; X64-PIC-NEXT: .LBB6_6: # %bb3
+; X64-PIC-NEXT: leaq .LBB6_6(%rip), %rsi
+; X64-PIC-NEXT: cmpq %rsi, %rdx
+; X64-PIC-NEXT: cmovneq %rax, %rcx
+; X64-PIC-NEXT: movl $42, %eax
+; X64-PIC-NEXT: jmp .LBB6_3
+; X64-PIC-NEXT: .LBB6_7: # %bb5
+; X64-PIC-NEXT: leaq .LBB6_7(%rip), %rsi
+; X64-PIC-NEXT: cmpq %rsi, %rdx
+; X64-PIC-NEXT: cmovneq %rax, %rcx
+; X64-PIC-NEXT: movl $11, %eax
+; X64-PIC-NEXT: .LBB6_3: # %bb0
+; X64-PIC-NEXT: shlq $47, %rcx
+; X64-PIC-NEXT: orq %rcx, %rsp
+; X64-PIC-NEXT: retq
+;
; X64-RETPOLINE-LABEL: test_switch_jumptable:
; X64-RETPOLINE: # %bb.0: # %entry
; X64-RETPOLINE-NEXT: movq %rsp, %rcx
@@ -389,3 +599,190 @@ bb3:
bb5:
ret i32 11
}
+
+; This function's switch is crafted to trigger jump-table lowering in the x86
+; backend so that we can test how the exact jump table lowering behaves, but
+; also arranges for fallthroughs from case to case to ensure that this pattern
+; too can be handled.
+define i32 @test_switch_jumptable_fallthrough(i32 %idx, i32* %a.ptr, i32* %b.ptr, i32* %c.ptr, i32* %d.ptr) nounwind {
+; X64-LABEL: test_switch_jumptable_fallthrough:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movq %rsp, %r9
+; X64-NEXT: movq $-1, %r10
+; X64-NEXT: sarq $63, %r9
+; X64-NEXT: cmpl $3, %edi
+; X64-NEXT: ja .LBB7_2
+; X64-NEXT: # %bb.1: # %entry
+; X64-NEXT: cmovaq %r10, %r9
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: movl %edi, %esi
+; X64-NEXT: movq .LJTI7_0(,%rsi,8), %rsi
+; X64-NEXT: orq %r9, %rsi
+; X64-NEXT: jmpq *%rsi
+; X64-NEXT: .LBB7_2: # %bb0
+; X64-NEXT: cmovbeq %r10, %r9
+; X64-NEXT: movl (%rsi), %eax
+; X64-NEXT: orl %r9d, %eax
+; X64-NEXT: movq $.LBB7_3, %rsi
+; X64-NEXT: .LBB7_3: # %bb1
+; X64-NEXT: cmpq $.LBB7_3, %rsi
+; X64-NEXT: cmovneq %r10, %r9
+; X64-NEXT: addl (%rdx), %eax
+; X64-NEXT: orl %r9d, %eax
+; X64-NEXT: movq $.LBB7_4, %rsi
+; X64-NEXT: .LBB7_4: # %bb2
+; X64-NEXT: cmpq $.LBB7_4, %rsi
+; X64-NEXT: cmovneq %r10, %r9
+; X64-NEXT: addl (%rcx), %eax
+; X64-NEXT: orl %r9d, %eax
+; X64-NEXT: movq $.LBB7_5, %rsi
+; X64-NEXT: .LBB7_5: # %bb3
+; X64-NEXT: cmpq $.LBB7_5, %rsi
+; X64-NEXT: cmovneq %r10, %r9
+; X64-NEXT: addl (%r8), %eax
+; X64-NEXT: orl %r9d, %eax
+; X64-NEXT: movq $.LBB7_6, %rsi
+; X64-NEXT: .LBB7_6: # %bb4
+; X64-NEXT: cmpq $.LBB7_6, %rsi
+; X64-NEXT: cmovneq %r10, %r9
+; X64-NEXT: shlq $47, %r9
+; X64-NEXT: orq %r9, %rsp
+; X64-NEXT: retq
+;
+; X64-PIC-LABEL: test_switch_jumptable_fallthrough:
+; X64-PIC: # %bb.0: # %entry
+; X64-PIC-NEXT: movq %rsp, %r9
+; X64-PIC-NEXT: movq $-1, %r10
+; X64-PIC-NEXT: sarq $63, %r9
+; X64-PIC-NEXT: cmpl $3, %edi
+; X64-PIC-NEXT: ja .LBB7_2
+; X64-PIC-NEXT: # %bb.1: # %entry
+; X64-PIC-NEXT: cmovaq %r10, %r9
+; X64-PIC-NEXT: xorl %eax, %eax
+; X64-PIC-NEXT: movl %edi, %esi
+; X64-PIC-NEXT: leaq .LJTI7_0(%rip), %rdi
+; X64-PIC-NEXT: movslq (%rdi,%rsi,4), %rsi
+; X64-PIC-NEXT: addq %rdi, %rsi
+; X64-PIC-NEXT: orq %r9, %rsi
+; X64-PIC-NEXT: jmpq *%rsi
+; X64-PIC-NEXT: .LBB7_2: # %bb0
+; X64-PIC-NEXT: cmovbeq %r10, %r9
+; X64-PIC-NEXT: movl (%rsi), %eax
+; X64-PIC-NEXT: orl %r9d, %eax
+; X64-PIC-NEXT: leaq .LBB7_3(%rip), %rsi
+; X64-PIC-NEXT: .LBB7_3: # %bb1
+; X64-PIC-NEXT: leaq .LBB7_3(%rip), %rdi
+; X64-PIC-NEXT: cmpq %rdi, %rsi
+; X64-PIC-NEXT: cmovneq %r10, %r9
+; X64-PIC-NEXT: addl (%rdx), %eax
+; X64-PIC-NEXT: orl %r9d, %eax
+; X64-PIC-NEXT: leaq .LBB7_4(%rip), %rsi
+; X64-PIC-NEXT: .LBB7_4: # %bb2
+; X64-PIC-NEXT: leaq .LBB7_4(%rip), %rdx
+; X64-PIC-NEXT: cmpq %rdx, %rsi
+; X64-PIC-NEXT: cmovneq %r10, %r9
+; X64-PIC-NEXT: addl (%rcx), %eax
+; X64-PIC-NEXT: orl %r9d, %eax
+; X64-PIC-NEXT: leaq .LBB7_5(%rip), %rsi
+; X64-PIC-NEXT: .LBB7_5: # %bb3
+; X64-PIC-NEXT: leaq .LBB7_5(%rip), %rcx
+; X64-PIC-NEXT: cmpq %rcx, %rsi
+; X64-PIC-NEXT: cmovneq %r10, %r9
+; X64-PIC-NEXT: addl (%r8), %eax
+; X64-PIC-NEXT: orl %r9d, %eax
+; X64-PIC-NEXT: leaq .LBB7_6(%rip), %rsi
+; X64-PIC-NEXT: .LBB7_6: # %bb4
+; X64-PIC-NEXT: leaq .LBB7_6(%rip), %rcx
+; X64-PIC-NEXT: cmpq %rcx, %rsi
+; X64-PIC-NEXT: cmovneq %r10, %r9
+; X64-PIC-NEXT: shlq $47, %r9
+; X64-PIC-NEXT: orq %r9, %rsp
+; X64-PIC-NEXT: retq
+;
+; X64-RETPOLINE-LABEL: test_switch_jumptable_fallthrough:
+; X64-RETPOLINE: # %bb.0: # %entry
+; X64-RETPOLINE-NEXT: movq %rsp, %r9
+; X64-RETPOLINE-NEXT: movq $-1, %r10
+; X64-RETPOLINE-NEXT: sarq $63, %r9
+; X64-RETPOLINE-NEXT: xorl %eax, %eax
+; X64-RETPOLINE-NEXT: cmpl $1, %edi
+; X64-RETPOLINE-NEXT: jg .LBB8_5
+; X64-RETPOLINE-NEXT: # %bb.1: # %entry
+; X64-RETPOLINE-NEXT: cmovgq %r10, %r9
+; X64-RETPOLINE-NEXT: testl %edi, %edi
+; X64-RETPOLINE-NEXT: je .LBB8_2
+; X64-RETPOLINE-NEXT: # %bb.3: # %entry
+; X64-RETPOLINE-NEXT: cmoveq %r10, %r9
+; X64-RETPOLINE-NEXT: cmpl $1, %edi
+; X64-RETPOLINE-NEXT: jne .LBB8_8
+; X64-RETPOLINE-NEXT: # %bb.4:
+; X64-RETPOLINE-NEXT: cmovneq %r10, %r9
+; X64-RETPOLINE-NEXT: jmp .LBB8_10
+; X64-RETPOLINE-NEXT: .LBB8_5: # %entry
+; X64-RETPOLINE-NEXT: cmovleq %r10, %r9
+; X64-RETPOLINE-NEXT: cmpl $2, %edi
+; X64-RETPOLINE-NEXT: je .LBB8_6
+; X64-RETPOLINE-NEXT: # %bb.7: # %entry
+; X64-RETPOLINE-NEXT: cmoveq %r10, %r9
+; X64-RETPOLINE-NEXT: cmpl $3, %edi
+; X64-RETPOLINE-NEXT: jne .LBB8_8
+; X64-RETPOLINE-NEXT: # %bb.13:
+; X64-RETPOLINE-NEXT: cmovneq %r10, %r9
+; X64-RETPOLINE-NEXT: jmp .LBB8_12
+; X64-RETPOLINE-NEXT: .LBB8_8:
+; X64-RETPOLINE-NEXT: cmoveq %r10, %r9
+; X64-RETPOLINE-NEXT: movl (%rsi), %eax
+; X64-RETPOLINE-NEXT: orl %r9d, %eax
+; X64-RETPOLINE-NEXT: jmp .LBB8_9
+; X64-RETPOLINE-NEXT: .LBB8_2:
+; X64-RETPOLINE-NEXT: cmovneq %r10, %r9
+; X64-RETPOLINE-NEXT: .LBB8_9: # %bb1
+; X64-RETPOLINE-NEXT: addl (%rdx), %eax
+; X64-RETPOLINE-NEXT: orl %r9d, %eax
+; X64-RETPOLINE-NEXT: .LBB8_10: # %bb2
+; X64-RETPOLINE-NEXT: addl (%rcx), %eax
+; X64-RETPOLINE-NEXT: orl %r9d, %eax
+; X64-RETPOLINE-NEXT: jmp .LBB8_11
+; X64-RETPOLINE-NEXT: .LBB8_6:
+; X64-RETPOLINE-NEXT: cmovneq %r10, %r9
+; X64-RETPOLINE-NEXT: .LBB8_11: # %bb3
+; X64-RETPOLINE-NEXT: addl (%r8), %eax
+; X64-RETPOLINE-NEXT: orl %r9d, %eax
+; X64-RETPOLINE-NEXT: .LBB8_12: # %bb4
+; X64-RETPOLINE-NEXT: shlq $47, %r9
+; X64-RETPOLINE-NEXT: orq %r9, %rsp
+; X64-RETPOLINE-NEXT: retq
+entry:
+ switch i32 %idx, label %bb0 [
+ i32 0, label %bb1
+ i32 1, label %bb2
+ i32 2, label %bb3
+ i32 3, label %bb4
+ ]
+
+bb0:
+ %a = load i32, i32* %a.ptr
+ br label %bb1
+
+bb1:
+ %b.phi = phi i32 [ 0, %entry ], [ %a, %bb0 ]
+ %b = load i32, i32* %b.ptr
+ %b.sum = add i32 %b.phi, %b
+ br label %bb2
+
+bb2:
+ %c.phi = phi i32 [ 0, %entry ], [ %b.sum, %bb1 ]
+ %c = load i32, i32* %c.ptr
+ %c.sum = add i32 %c.phi, %c
+ br label %bb3
+
+bb3:
+ %d.phi = phi i32 [ 0, %entry ], [ %c.sum, %bb2 ]
+ %d = load i32, i32* %d.ptr
+ %d.sum = add i32 %d.phi, %d
+ br label %bb4
+
+bb4:
+ %e.phi = phi i32 [ 0, %entry ], [ %d.sum, %bb3 ]
+ ret i32 %e.phi
+}