| -rw-r--r-- | llvm/lib/Target/AArch64/AArch64.h | 2 |
| -rw-r--r-- | llvm/lib/Target/AArch64/AArch64FastISel.cpp | 7 |
| -rw-r--r-- | llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 18 |
| -rw-r--r-- | llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 7 |
| -rw-r--r-- | llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp | 34 |
| -rw-r--r-- | llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp | 4 |
| -rw-r--r-- | llvm/lib/Target/AArch64/AArch64SpeculationHardening.cpp | 368 |
| -rw-r--r-- | llvm/lib/Target/AArch64/AArch64TargetMachine.cpp | 11 |
| -rw-r--r-- | llvm/lib/Target/AArch64/CMakeLists.txt | 1 |
| -rw-r--r-- | llvm/test/CodeGen/AArch64/O0-pipeline.ll | 1 |
| -rw-r--r-- | llvm/test/CodeGen/AArch64/O3-pipeline.ll | 1 |
| -rw-r--r-- | llvm/test/CodeGen/AArch64/speculation-hardening-dagisel.ll | 71 |
| -rw-r--r-- | llvm/test/CodeGen/AArch64/speculation-hardening.ll | 156 |
| -rw-r--r-- | llvm/test/CodeGen/AArch64/speculation-hardening.mir | 117 |
14 files changed, 789 insertions, 9 deletions
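In outline, the new AArch64SpeculationHardening pass tracks conditional-branch misspeculation in a reserved taint register (X16): conditional branches are forced to consume NZCV, the same flags are re-checked with a CSEL on each outgoing edge, and the taint is encoded into SP around calls and returns. The following hand-written sketch shows the pattern the tests below check for; register choices come from the pass, but the labels and surrounding code are illustrative, not verbatim compiler output:

    // Function entry (and landing pads): recover the caller's taint from SP.
    cmp   sp, #0                // SUBS xzr, sp, #0
    csetm x16, ne               // all-ones normally, all-zeros if SP encodes misspeculation

    // Conditional control flow: no CB(N)Z/TB(N)Z, so the branch flags can be
    // re-evaluated by a CSEL on each outgoing edge.
    cmp   w0, w1
    b.lt  .Ltaken
    csel  x16, x16, xzr, ge     // fall-through edge: zero the taint if reached by misspeculation
    ...
.Ltaken:
    csel  x16, x16, xzr, lt     // taken edge: same check with the branch's own condition
    ...

    // Before returns (and calls): fold the taint into SP, which becomes 0
    // under misspeculation.
    mov   x17, sp
    and   x17, x17, x16
    mov   sp, x17
    ret

Functions that already use X16/W16 themselves cannot reserve the taint register; for those the pass falls back to full DSB SY/ISB barriers, sketched after the tests at the end of this page.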
diff --git a/llvm/lib/Target/AArch64/AArch64.h b/llvm/lib/Target/AArch64/AArch64.h index 2f0d0bf346d..c36d9354f3b 100644 --- a/llvm/lib/Target/AArch64/AArch64.h +++ b/llvm/lib/Target/AArch64/AArch64.h @@ -39,6 +39,7 @@ FunctionPass *createAArch64ISelDag(AArch64TargetMachine &TM,                                   CodeGenOpt::Level OptLevel);  FunctionPass *createAArch64StorePairSuppressPass();  FunctionPass *createAArch64ExpandPseudoPass(); +FunctionPass *createAArch64SpeculationHardeningPass();  FunctionPass *createAArch64LoadStoreOptimizationPass();  FunctionPass *createAArch64SIMDInstrOptPass();  ModulePass *createAArch64PromoteConstantPass(); @@ -68,6 +69,7 @@ void initializeAArch64ConditionalComparesPass(PassRegistry&);  void initializeAArch64ConditionOptimizerPass(PassRegistry&);  void initializeAArch64DeadRegisterDefinitionsPass(PassRegistry&);  void initializeAArch64ExpandPseudoPass(PassRegistry&); +void initializeAArch64SpeculationHardeningPass(PassRegistry&);  void initializeAArch64LoadStoreOptPass(PassRegistry&);  void initializeAArch64SIMDInstrOptPass(PassRegistry&);  void initializeAArch64PreLegalizerCombinerPass(PassRegistry&); diff --git a/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/llvm/lib/Target/AArch64/AArch64FastISel.cpp index 7a7b0dd20a4..47550cabb9f 100644 --- a/llvm/lib/Target/AArch64/AArch64FastISel.cpp +++ b/llvm/lib/Target/AArch64/AArch64FastISel.cpp @@ -2258,6 +2258,13 @@ static AArch64CC::CondCode getCompareCC(CmpInst::Predicate Pred) {  /// Try to emit a combined compare-and-branch instruction.  bool AArch64FastISel::emitCompareAndBranch(const BranchInst *BI) { +  // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions +  // will not be produced, as they are conditional branch instructions that do +  // not set flags. +  if (FuncInfo.MF->getFunction().hasFnAttribute( +          Attribute::SpeculativeLoadHardening)) +    return false; +    assert(isa<CmpInst>(BI->getCondition()) && "Expected cmp instruction");    const CmpInst *CI = cast<CmpInst>(BI->getCondition());    CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 7b539417941..cc10c9688e1 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -4343,6 +4343,13 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {    SDValue Dest = Op.getOperand(4);    SDLoc dl(Op); +  MachineFunction &MF = DAG.getMachineFunction(); +  // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions +  // will not be produced, as they are conditional branch instructions that do +  // not set flags. +  bool ProduceNonFlagSettingCondBr = +      !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening); +    // Handle f128 first, since lowering it will result in comparing the return    // value of a libcall against zero, which is just what the rest of LowerBR_CC    // is expecting to deal with. @@ -4385,7 +4392,7 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {      // If the RHS of the comparison is zero, we can potentially fold this      // to a specialized branch.      
const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS); -    if (RHSC && RHSC->getZExtValue() == 0) { +    if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) {        if (CC == ISD::SETEQ) {          // See if we can use a TBZ to fold in an AND as well.          // TBZ has a smaller branch displacement than CBZ.  If the offset is @@ -4428,7 +4435,7 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {        }      }      if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT && -        LHS.getOpcode() != ISD::AND) { +        LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) {        // Don't combine AND since emitComparison converts the AND to an ANDS        // (a.k.a. TST) and the test in the test bit and branch instruction        // becomes redundant.  This would also increase register pressure. @@ -10807,6 +10814,13 @@ SDValue performCONDCombine(SDNode *N,  static SDValue performBRCONDCombine(SDNode *N,                                      TargetLowering::DAGCombinerInfo &DCI,                                      SelectionDAG &DAG) { +  MachineFunction &MF = DAG.getMachineFunction(); +  // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions +  // will not be produced, as they are conditional branch instructions that do +  // not set flags. +  if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening)) +    return SDValue(); +    if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3))      N = NV.getNode();    SDValue Chain = N->getOperand(0); diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index eddb349f0bf..10464ea57bb 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -964,6 +964,13 @@ bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI,                                              const MachineFunction &MF) const {    if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF))      return true; +  switch (MI.getOpcode()) { +  case AArch64::DSB: +  case AArch64::ISB: +    // DSB and ISB also are scheduling barriers. +    return true; +  default:; +  }    return isSEHInstruction(MI);  } diff --git a/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp index 90258cc1555..6cbfb6ab161 100644 --- a/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp @@ -788,16 +788,36 @@ bool AArch64InstructionSelector::select(MachineInstr &I,      const unsigned CondReg = I.getOperand(0).getReg();      MachineBasicBlock *DestMBB = I.getOperand(1).getMBB(); -    if (selectCompareBranch(I, MF, MRI)) +    // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z +    // instructions will not be produced, as they are conditional branch +    // instructions that do not set flags. 
+    bool ProduceNonFlagSettingCondBr = +        !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening); +    if (ProduceNonFlagSettingCondBr && selectCompareBranch(I, MF, MRI))        return true; -    auto MIB = BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::TBNZW)) -                   .addUse(CondReg) -                   .addImm(/*bit offset=*/0) -                   .addMBB(DestMBB); +    if (ProduceNonFlagSettingCondBr) { +      auto MIB = BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::TBNZW)) +                     .addUse(CondReg) +                     .addImm(/*bit offset=*/0) +                     .addMBB(DestMBB); -    I.eraseFromParent(); -    return constrainSelectedInstRegOperands(*MIB.getInstr(), TII, TRI, RBI); +      I.eraseFromParent(); +      return constrainSelectedInstRegOperands(*MIB.getInstr(), TII, TRI, RBI); +    } else { +      auto CMP = BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::ANDSWri)) +                     .addDef(AArch64::WZR) +                     .addUse(CondReg) +                     .addImm(1); +      constrainSelectedInstRegOperands(*CMP.getInstr(), TII, TRI, RBI); +      auto Bcc = +          BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::Bcc)) +              .addImm(AArch64CC::EQ) +              .addMBB(DestMBB); + +      I.eraseFromParent(); +      return constrainSelectedInstRegOperands(*Bcc.getInstr(), TII, TRI, RBI); +    }    }    case TargetOpcode::G_BRINDIRECT: { diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index 55631bcba23..96ae45ae3d0 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -203,6 +203,10 @@ AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const {    if (hasBasePointer(MF))      markSuperRegs(Reserved, AArch64::W19); +  // SLH uses register W16/X16 as the taint register. +  if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening)) +    markSuperRegs(Reserved, AArch64::W16); +    assert(checkAllSuperRegsMarked(Reserved));    return Reserved;  } diff --git a/llvm/lib/Target/AArch64/AArch64SpeculationHardening.cpp b/llvm/lib/Target/AArch64/AArch64SpeculationHardening.cpp new file mode 100644 index 00000000000..1f8ef5ee6ea --- /dev/null +++ b/llvm/lib/Target/AArch64/AArch64SpeculationHardening.cpp @@ -0,0 +1,368 @@ +//===- AArch64SpeculationHardening.cpp - Harden Against Missspeculation  --===// +// +//                     The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a pass to insert code to mitigate against side channel +// vulnerabilities that may happen under control flow miss-speculation. +// +// The pass implements tracking of control flow miss-speculation into a "taint" +// register. That taint register can then be used to mask off registers with +// sensitive data when executing under miss-speculation, a.k.a. "transient +// execution". +// This pass is aimed at mitigating against SpectreV1-style vulnarabilities. +// +// At the moment, it implements the tracking of miss-speculation of control +// flow into a taint register, but doesn't implement a mechanism yet to then +// use that taint register to mask of vulnerable data in registers (something +// for a follow-on improvement). 
Possible strategies to mask out vulnerable +// data that can be implemented on top of this are: +// - speculative load hardening to automatically mask of data loaded +//   in registers. +// - using intrinsics to mask of data in registers as indicated by the +//   programmer (see https://lwn.net/Articles/759423/). +// +// For AArch64, the following implementation choices are made below. +// Some of these are different than the implementation choices made in +// the similar pass implemented in X86SpeculativeLoadHardening.cpp, as +// the instruction set characteristics result in different trade-offs. +// - The speculation hardening is done after register allocation. With a +//   relative abundance of registers, one register is reserved (X16) to be +//   the taint register. X16 is expected to not clash with other register +//   reservation mechanisms with very high probability because: +//   . The AArch64 ABI doesn't guarantee X16 to be retained across any call. +//   . The only way to request X16 to be used as a programmer is through +//     inline assembly. In the rare case a function explicitly demands to +//     use X16/W16, this pass falls back to hardening against speculation +//     by inserting a DSB SYS/ISB barrier pair which will prevent control +//     flow speculation. +// - It is easy to insert mask operations at this late stage as we have +//   mask operations available that don't set flags. +// - The taint variable contains all-ones when no miss-speculation is detected, +//   and contains all-zeros when miss-speculation is detected. Therefore, when +//   masking, an AND instruction (which only changes the register to be masked, +//   no other side effects) can easily be inserted anywhere that's needed. +// - The tracking of miss-speculation is done by using a data-flow conditional +//   select instruction (CSEL) to evaluate the flags that were also used to +//   make conditional branch direction decisions. Speculation of the CSEL +//   instruction can be limited with a CSDB instruction - so the combination of +//   CSEL + a later CSDB gives the guarantee that the flags as used in the CSEL +//   aren't speculated. When conditional branch direction gets miss-speculated, +//   the semantics of the inserted CSEL instruction is such that the taint +//   register will contain all zero bits. +//   One key requirement for this to work is that the conditional branch is +//   followed by an execution of the CSEL instruction, where the CSEL +//   instruction needs to use the same flags status as the conditional branch. +//   This means that the conditional branches must not be implemented as one +//   of the AArch64 conditional branches that do not use the flags as input +//   (CB(N)Z and TB(N)Z). This is implemented by ensuring in the instruction +//   selectors to not produce these instructions when speculation hardening +//   is enabled. This pass will assert if it does encounter such an instruction. +// - On function call boundaries, the miss-speculation state is transferred from +//   the taint register X16 to be encoded in the SP register as value 0. +// +// Future extensions/improvements could be: +// - Implement this functionality using full speculation barriers, akin to the +//   x86-slh-lfence option. This may be more useful for the intrinsics-based +//   approach than for the SLH approach to masking. +//   Note that this pass already inserts the full speculation barriers if the +//   function for some niche reason makes use of X16/W16. 
+// - no indirect branch misprediction gets protected/instrumented; but this +//   could be done for some indirect branches, such as switch jump tables. +//===----------------------------------------------------------------------===// + +#include "AArch64InstrInfo.h" +#include "AArch64Subtarget.h" +#include "Utils/AArch64BaseInfo.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/Pass.h" +#include "llvm/Support/CodeGen.h" +#include "llvm/Target/TargetMachine.h" +#include <cassert> + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-speculation-hardening" + +#define AARCH64_SPECULATION_HARDENING_NAME "AArch64 speculation hardening pass" + +namespace { + +class AArch64SpeculationHardening : public MachineFunctionPass { +public: +  const TargetInstrInfo *TII; +  const TargetRegisterInfo *TRI; + +  static char ID; + +  AArch64SpeculationHardening() : MachineFunctionPass(ID) { +    initializeAArch64SpeculationHardeningPass(*PassRegistry::getPassRegistry()); +  } + +  bool runOnMachineFunction(MachineFunction &Fn) override; + +  StringRef getPassName() const override { +    return AARCH64_SPECULATION_HARDENING_NAME; +  } + +private: +  unsigned MisspeculatingTaintReg; +  bool UseControlFlowSpeculationBarrier; + +  bool functionUsesHardeningRegister(MachineFunction &MF) const; +  bool instrumentControlFlow(MachineBasicBlock &MBB); +  bool endsWithCondControlFlow(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, +                               MachineBasicBlock *&FBB, +                               AArch64CC::CondCode &CondCode) const; +  void insertTrackingCode(MachineBasicBlock &SplitEdgeBB, +                          AArch64CC::CondCode &CondCode, DebugLoc DL) const; +  void insertSPToRegTaintPropagation(MachineBasicBlock *MBB, +                                     MachineBasicBlock::iterator MBBI) const; +  void insertRegToSPTaintPropagation(MachineBasicBlock *MBB, +                                     MachineBasicBlock::iterator MBBI, +                                     unsigned TmpReg) const; +}; + +} // end anonymous namespace + +char AArch64SpeculationHardening::ID = 0; + +INITIALIZE_PASS(AArch64SpeculationHardening, "aarch64-speculation-hardening", +                AARCH64_SPECULATION_HARDENING_NAME, false, false) + +bool AArch64SpeculationHardening::endsWithCondControlFlow( +    MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, +    AArch64CC::CondCode &CondCode) const { +  SmallVector<MachineOperand, 1> analyzeBranchCondCode; +  if (TII->analyzeBranch(MBB, TBB, FBB, analyzeBranchCondCode, false)) +    return false; + +  // Ignore if the BB ends in an unconditional branch/fall-through. +  if (analyzeBranchCondCode.empty()) +    return false; + +  // If the BB ends with a single conditional branch, FBB will be set to +  // nullptr (see API docs for TII->analyzeBranch). For the rest of the +  // analysis we want the FBB block to be set always. 
+  assert(TBB != nullptr); +  if (FBB == nullptr) +    FBB = MBB.getFallThrough(); + +  // If both the true and the false condition jump to the same basic block, +  // there isn't need for any protection - whether the branch is speculated +  // correctly or not, we end up executing the architecturally correct code. +  if (TBB == FBB) +    return false; + +  assert(MBB.succ_size() == 2); +  // translate analyzeBranchCondCode to CondCode. +  assert(analyzeBranchCondCode.size() == 1 && "unknown Cond array format"); +  CondCode = AArch64CC::CondCode(analyzeBranchCondCode[0].getImm()); +  return true; +} + +void AArch64SpeculationHardening::insertTrackingCode( +    MachineBasicBlock &SplitEdgeBB, AArch64CC::CondCode &CondCode, +    DebugLoc DL) const { +  if (UseControlFlowSpeculationBarrier) { +    // insert full control flow speculation barrier (DSB SYS + ISB) +    BuildMI(SplitEdgeBB, SplitEdgeBB.begin(), DL, TII->get(AArch64::ISB)) +        .addImm(0xf); +    BuildMI(SplitEdgeBB, SplitEdgeBB.begin(), DL, TII->get(AArch64::DSB)) +        .addImm(0xf); +  } else { +    BuildMI(SplitEdgeBB, SplitEdgeBB.begin(), DL, TII->get(AArch64::CSELXr)) +        .addDef(MisspeculatingTaintReg) +        .addUse(MisspeculatingTaintReg) +        .addUse(AArch64::XZR) +        .addImm(CondCode); +    SplitEdgeBB.addLiveIn(AArch64::NZCV); +  } +} + +bool AArch64SpeculationHardening::instrumentControlFlow( +    MachineBasicBlock &MBB) { +  LLVM_DEBUG(dbgs() << "Instrument control flow tracking on MBB: " << MBB); + +  bool Modified = false; +  MachineBasicBlock *TBB = nullptr; +  MachineBasicBlock *FBB = nullptr; +  AArch64CC::CondCode CondCode; + +  if (!endsWithCondControlFlow(MBB, TBB, FBB, CondCode)) { +    LLVM_DEBUG(dbgs() << "... doesn't end with CondControlFlow\n"); +  } else { +    // Now insert: +    // "CSEL MisSpeculatingR, MisSpeculatingR, XZR, cond" on the True edge and +    // "CSEL MisSpeculatingR, MisSpeculatingR, XZR, Invertcond" on the False +    // edge. +    AArch64CC::CondCode InvCondCode = AArch64CC::getInvertedCondCode(CondCode); + +    MachineBasicBlock *SplitEdgeTBB = MBB.SplitCriticalEdge(TBB, *this); +    MachineBasicBlock *SplitEdgeFBB = MBB.SplitCriticalEdge(FBB, *this); + +    assert(SplitEdgeTBB != nullptr); +    assert(SplitEdgeFBB != nullptr); + +    DebugLoc DL; +    if (MBB.instr_end() != MBB.instr_begin()) +      DL = (--MBB.instr_end())->getDebugLoc(); + +    insertTrackingCode(*SplitEdgeTBB, CondCode, DL); +    insertTrackingCode(*SplitEdgeFBB, InvCondCode, DL); + +    LLVM_DEBUG(dbgs() << "SplitEdgeTBB: " << *SplitEdgeTBB << "\n"); +    LLVM_DEBUG(dbgs() << "SplitEdgeFBB: " << *SplitEdgeFBB << "\n"); +    Modified = true; +  } + +  // Perform correct code generation around function calls and before returns. 
+  { +    SmallVector<MachineInstr *, 4> ReturnInstructions; +    SmallVector<MachineInstr *, 4> CallInstructions; + +    for (MachineInstr &MI : MBB) { +      if (MI.isReturn()) +        ReturnInstructions.push_back(&MI); +      else if (MI.isCall()) +        CallInstructions.push_back(&MI); +    } + +    Modified |= +        (ReturnInstructions.size() > 0) || (CallInstructions.size() > 0); + +    for (MachineInstr *Return : ReturnInstructions) +      insertRegToSPTaintPropagation(Return->getParent(), Return, AArch64::X17); +    for (MachineInstr *Call : CallInstructions) { +      // Just after the call: +      MachineBasicBlock::iterator i = Call; +      i++; +      insertSPToRegTaintPropagation(Call->getParent(), i); +      // Just before the call: +      insertRegToSPTaintPropagation(Call->getParent(), Call, AArch64::X17); +    } +  } + +  return Modified; +} + +void AArch64SpeculationHardening::insertSPToRegTaintPropagation( +    MachineBasicBlock *MBB, MachineBasicBlock::iterator MBBI) const { +  // If full control flow speculation barriers are used, emit a control flow +  // barrier to block potential miss-speculation in flight coming in to this +  // function. +  if (UseControlFlowSpeculationBarrier) { +    // insert full control flow speculation barrier (DSB SYS + ISB) +    BuildMI(*MBB, MBBI, DebugLoc(), TII->get(AArch64::DSB)).addImm(0xf); +    BuildMI(*MBB, MBBI, DebugLoc(), TII->get(AArch64::ISB)).addImm(0xf); +    return; +  } + +  // CMP   SP, #0   === SUBS   xzr, SP, #0 +  BuildMI(*MBB, MBBI, DebugLoc(), TII->get(AArch64::SUBSXri)) +      .addDef(AArch64::XZR) +      .addUse(AArch64::SP) +      .addImm(0) +      .addImm(0); // no shift +  // CSETM x16, NE  === CSINV  x16, xzr, xzr, EQ +  BuildMI(*MBB, MBBI, DebugLoc(), TII->get(AArch64::CSINVXr)) +      .addDef(MisspeculatingTaintReg) +      .addUse(AArch64::XZR) +      .addUse(AArch64::XZR) +      .addImm(AArch64CC::EQ); +} + +void AArch64SpeculationHardening::insertRegToSPTaintPropagation( +    MachineBasicBlock *MBB, MachineBasicBlock::iterator MBBI, +    unsigned TmpReg) const { +  // If full control flow speculation barriers are used, there will not be +  // miss-speculation when returning from this function, and therefore, also +  // no need to encode potential miss-speculation into the stack pointer. +  if (UseControlFlowSpeculationBarrier) +    return; + +  // mov   Xtmp, SP  === ADD  Xtmp, SP, #0 +  BuildMI(*MBB, MBBI, DebugLoc(), TII->get(AArch64::ADDXri)) +      .addDef(TmpReg) +      .addUse(AArch64::SP) +      .addImm(0) +      .addImm(0); // no shift +  // and   Xtmp, Xtmp, TaintReg === AND Xtmp, Xtmp, TaintReg, #0 +  BuildMI(*MBB, MBBI, DebugLoc(), TII->get(AArch64::ANDXrs)) +      .addDef(TmpReg, RegState::Renamable) +      .addUse(TmpReg, RegState::Kill | RegState::Renamable) +      .addUse(MisspeculatingTaintReg, RegState::Kill) +      .addImm(0); +  // mov   SP, Xtmp === ADD SP, Xtmp, #0 +  BuildMI(*MBB, MBBI, DebugLoc(), TII->get(AArch64::ADDXri)) +      .addDef(AArch64::SP) +      .addUse(TmpReg, RegState::Kill) +      .addImm(0) +      .addImm(0); // no shift +} + +bool AArch64SpeculationHardening::functionUsesHardeningRegister( +    MachineFunction &MF) const { +  for (MachineBasicBlock &MBB : MF) { +    for (MachineInstr &MI : MBB) { +      // treat function calls specially, as the hardening register does not +      // need to remain live across function calls. 
+      if (MI.isCall()) +        continue; +      if (MI.readsRegister(MisspeculatingTaintReg, TRI) || +          MI.modifiesRegister(MisspeculatingTaintReg, TRI)) +        return true; +    } +  } +  return false; +} + +bool AArch64SpeculationHardening::runOnMachineFunction(MachineFunction &MF) { +  if (!MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening)) +    return false; + +  MisspeculatingTaintReg = AArch64::X16; +  TII = MF.getSubtarget().getInstrInfo(); +  TRI = MF.getSubtarget().getRegisterInfo(); +  bool Modified = false; + +  UseControlFlowSpeculationBarrier = functionUsesHardeningRegister(MF); + +  // Instrument control flow speculation tracking, if requested. +  LLVM_DEBUG( +      dbgs() +      << "***** AArch64SpeculationHardening - track control flow *****\n"); + +  // 1. Add instrumentation code to function entry and exits. +  SmallVector<MachineBasicBlock *, 2> EntryBlocks; +  EntryBlocks.push_back(&MF.front()); +  for (const LandingPadInfo &LPI : MF.getLandingPads()) +    EntryBlocks.push_back(LPI.LandingPadBlock); +  for (auto Entry : EntryBlocks) +    insertSPToRegTaintPropagation( +        Entry, Entry->SkipPHIsLabelsAndDebug(Entry->begin())); + +  // 2. Add instrumentation code to every basic block. +  for (auto &MBB : MF) +    Modified |= instrumentControlFlow(MBB); + +  return Modified; +} + +/// \brief Returns an instance of the pseudo instruction expansion pass. +FunctionPass *llvm::createAArch64SpeculationHardeningPass() { +  return new AArch64SpeculationHardening(); +} diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index 32c853483e3..4e016525f7e 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -177,6 +177,7 @@ extern "C" void LLVMInitializeAArch64Target() {    initializeFalkorHWPFFixPass(*PR);    initializeFalkorMarkStridedAccessesLegacyPass(*PR);    initializeLDTLSCleanupPass(*PR); +  initializeAArch64SpeculationHardeningPass(*PR);  }  //===----------------------------------------------------------------------===// @@ -550,6 +551,16 @@ void AArch64PassConfig::addPreSched2() {    if (TM->getOptLevel() != CodeGenOpt::None) {      if (EnableLoadStoreOpt)        addPass(createAArch64LoadStoreOptimizationPass()); +  } + +  // The AArch64SpeculationHardeningPass destroys dominator tree and natural +  // loop info, which is needed for the FalkorHWPFFixPass and also later on. +  // Therefore, run the AArch64SpeculationHardeningPass before the +  // FalkorHWPFFixPass to avoid recomputing dominator tree and natural loop +  // info. 
+  addPass(createAArch64SpeculationHardeningPass()); + +  if (TM->getOptLevel() != CodeGenOpt::None) {      if (EnableFalkorHWPFFix)        addPass(createFalkorHWPFFixPass());    } diff --git a/llvm/lib/Target/AArch64/CMakeLists.txt b/llvm/lib/Target/AArch64/CMakeLists.txt index 9c8c1d0e0ff..7778882d491 100644 --- a/llvm/lib/Target/AArch64/CMakeLists.txt +++ b/llvm/lib/Target/AArch64/CMakeLists.txt @@ -52,6 +52,7 @@ add_llvm_target(AArch64CodeGen    AArch64RegisterBankInfo.cpp    AArch64RegisterInfo.cpp    AArch64SelectionDAGInfo.cpp +  AArch64SpeculationHardening.cpp    AArch64StorePairSuppress.cpp    AArch64Subtarget.cpp    AArch64TargetMachine.cpp diff --git a/llvm/test/CodeGen/AArch64/O0-pipeline.ll b/llvm/test/CodeGen/AArch64/O0-pipeline.ll index d85d126883c..6d0aa91272b 100644 --- a/llvm/test/CodeGen/AArch64/O0-pipeline.ll +++ b/llvm/test/CodeGen/AArch64/O0-pipeline.ll @@ -50,6 +50,7 @@  ; CHECK-NEXT:       Prologue/Epilogue Insertion & Frame Finalization  ; CHECK-NEXT:       Post-RA pseudo instruction expansion pass  ; CHECK-NEXT:       AArch64 pseudo instruction expansion pass +; CHECK-NEXT:       AArch64 speculation hardening pass  ; CHECK-NEXT:       Analyze Machine Code For Garbage Collection  ; CHECK-NEXT:       Branch relaxation pass  ; CHECK-NEXT:       AArch64 Branch Targets diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll index 29682b7b2d1..98cef01b6a9 100644 --- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll +++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll @@ -146,6 +146,7 @@  ; CHECK-NEXT:       Post-RA pseudo instruction expansion pass  ; CHECK-NEXT:       AArch64 pseudo instruction expansion pass  ; CHECK-NEXT:       AArch64 load / store optimization pass +; CHECK-NEXT:       AArch64 speculation hardening pass  ; CHECK-NEXT:       MachineDominator Tree Construction  ; CHECK-NEXT:       Machine Natural Loop Construction  ; CHECK-NEXT:       Falkor HW Prefetch Fix Late Phase diff --git a/llvm/test/CodeGen/AArch64/speculation-hardening-dagisel.ll b/llvm/test/CodeGen/AArch64/speculation-hardening-dagisel.ll new file mode 100644 index 00000000000..4d13d98441e --- /dev/null +++ b/llvm/test/CodeGen/AArch64/speculation-hardening-dagisel.ll @@ -0,0 +1,71 @@ +; RUN: sed -e 's/SLHATTR/speculative_load_hardening/' %s | llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu | FileCheck %s --check-prefixes=CHECK,SLH --dump-input-on-failure +; RUN: sed -e 's/SLHATTR//' %s | llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu | FileCheck %s --check-prefixes=CHECK,NOSLH --dump-input-on-failure + +declare i64 @g(i64, i64) local_unnamed_addr +define i64 @f_using_reserved_reg_x16(i64 %a, i64 %b) local_unnamed_addr SLHATTR { +; CHECK-LABEL: f_using_reserved_reg_x16 +; SLH: dsb sy +; SLH: isb +; NOSLH-NOT: dsb sy +; NOSLH-NOT: isb +entry: +  %cmp = icmp ugt i64 %a, %b +  br i1 %cmp, label %if.then, label %cleanup + +; CHECK: b.ls +; SLH: dsb sy +; SLH: isb +; NOSLH-NOT: dsb sy +; NOSLH-NOT: isb +if.then: +  %0 = tail call i64 asm "autia1716", "={x17},{x16},0"(i64 %b, i64 %a) +; CHECK: bl g +; SLH: dsb sy +; SLH: isb +; NOSLH-NOT: dsb sy +; NOSLH-NOT: isb +; CHECK: ret +  %call = tail call i64 @g(i64 %a, i64 %b) #3 +  %add = add i64 %call, %0 +  br label %cleanup + +cleanup: +; SLH: dsb sy +; SLH: isb +; NOSLH-NOT: dsb sy +; NOSLH-NOT: isb +; SLH: ret +  %retval.0 = phi i64 [ %add, %if.then ], [ %b, %entry ] +  ret i64 %retval.0 +} + +define i32 @f_clobbered_reg_w16(i32 %a, i32 %b) local_unnamed_addr SLHATTR { +; CHECK-LABEL: 
f_clobbered_reg_w16 +entry: +; SLH: dsb sy +; SLH: isb +; NOSLH-NOT: dsb sy +; NOSLH-NOT: isb +  %cmp = icmp sgt i32 %a, %b +  br i1 %cmp, label %if.then, label %if.end +; CHECK: b.le + +if.then: +; SLH: dsb sy +; SLH: isb +; NOSLH-NOT: dsb sy +; NOSLH-NOT: isb +; CHECK: mov w16, w0 +  tail call void asm sideeffect "mov w16, ${0:w}", "r,~{w16}"(i32 %a) +  br label %if.end +; SLH: ret + +if.end: +  %add = add nsw i32 %b, %a +  ret i32 %add +; SLH: dsb sy +; SLH: isb +; NOSLH-NOT: dsb sy +; NOSLH-NOT: isb +; SLH: ret +} diff --git a/llvm/test/CodeGen/AArch64/speculation-hardening.ll b/llvm/test/CodeGen/AArch64/speculation-hardening.ll new file mode 100644 index 00000000000..3535b63c32c --- /dev/null +++ b/llvm/test/CodeGen/AArch64/speculation-hardening.ll @@ -0,0 +1,156 @@ +; RUN: sed -e 's/SLHATTR/speculative_load_hardening/' %s | llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu | FileCheck %s --check-prefixes=CHECK,SLH --dump-input-on-failure +; RUN: sed -e 's/SLHATTR//' %s | llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu | FileCheck %s --check-prefixes=CHECK,NOSLH --dump-input-on-failure +; RUN: sed -e 's/SLHATTR/speculative_load_hardening/' %s | llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -global-isel | FileCheck %s --check-prefixes=CHECK,SLH --dump-input-on-failure +; RUN sed -e 's/SLHATTR//' %s | llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -global-isel | FileCheck %s --check-prefixes=CHECK,NOSLH --dump-input-on-failure +; RUN: sed -e 's/SLHATTR/speculative_load_hardening/' %s | llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -fast-isel | FileCheck %s --check-prefixes=CHECK,SLH --dump-input-on-failure +; RUN: sed -e 's/SLHATTR//' %s | llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -fast-isel | FileCheck %s --check-prefixes=CHECK,NOSLH --dump-input-on-failure + +define i32 @f(i8* nocapture readonly %p, i32 %i, i32 %N) local_unnamed_addr SLHATTR { +; CHECK-LABEL: f +entry: +; SLH:  cmp sp, #0 +; SLH:  csetm x16, ne +; NOSLH-NOT:  cmp sp, #0 +; NOSLH-NOT:  csetm x16, ne + +; SLH:  mov x17, sp +; SLH:  and x17, x17, x16 +; SLH:  mov sp, x17 +; NOSLH-NOT:  mov x17, sp +; NOSLH-NOT:  and x17, x17, x16 +; NOSLH-NOT:  mov sp, x17 +  %call = tail call i32 @tail_callee(i32 %i) +; SLH:  cmp sp, #0 +; SLH:  csetm x16, ne +; NOSLH-NOT:  cmp sp, #0 +; NOSLH-NOT:  csetm x16, ne +  %cmp = icmp slt i32 %call, %N +  br i1 %cmp, label %if.then, label %return +; GlobalISel lowers the branch to a b.ne sometimes instead of b.ge as expected.. +; CHECK: b.[[COND:(ge)|(lt)|(ne)]] + +if.then:                                          ; preds = %entry +; NOSLH-NOT: csel x16, x16, xzr, {{(lt)|(ge)|(eq)}} +; SLH-DAG: csel x16, x16, xzr, {{(lt)|(ge)|(eq)}} +  %idxprom = sext i32 %i to i64 +  %arrayidx = getelementptr inbounds i8, i8* %p, i64 %idxprom +  %0 = load i8, i8* %arrayidx, align 1 +; CHECK-DAG:      ldrb [[LOADED:w[0-9]+]], +  %conv = zext i8 %0 to i32 +  br label %return + +; SLH-DAG: csel x16, x16, xzr, [[COND]] +; NOSLH-NOT: csel x16, x16, xzr, [[COND]] +return:                                           ; preds = %entry, %if.then +  %retval.0 = phi i32 [ %conv, %if.then ], [ 0, %entry ] +; SLH:  mov x17, sp +; SLH:  and x17, x17, x16 +; SLH:  mov sp, x17 +; NOSLH-NOT:  mov x17, sp +; NOSLH-NOT:  and x17, x17, x16 +; NOSLH-NOT:  mov sp, x17 +  ret i32 %retval.0 +} + +; Make sure that for a tail call, taint doesn't get put into SP twice. 
+define i32 @tail_caller(i32 %a) local_unnamed_addr SLHATTR { +; CHECK-LABEL: tail_caller: +; SLH:     mov     x17, sp +; SLH:     and     x17, x17, x16 +; SLH:     mov     sp, x17 +; NOSLH-NOT:     mov     x17, sp +; NOSLH-NOT:     and     x17, x17, x16 +; NOSLH-NOT:     mov     sp, x17 +;  GlobalISel doesn't optimize tail calls (yet?), so only check that +;  cross-call taint register setup code is missing if a tail call was +;  actually produced. +; SLH:     {{(bl tail_callee[[:space:]] cmp sp, #0)|(b tail_callee)}} +; SLH-NOT: cmp sp, #0 +  %call = tail call i32 @tail_callee(i32 %a) +  ret i32 %call +} + +declare i32 @tail_callee(i32) local_unnamed_addr + +; Verify that no cb(n)z/tb(n)z instructions are produced when implementing +; SLH +define i32 @compare_branch_zero(i32, i32) SLHATTR { +; CHECK-LABEL: compare_branch_zero +  %3 = icmp eq i32 %0, 0 +  br i1 %3, label %then, label %else +;SLH-NOT:   cb{{n?}}z +;NOSLH:     cb{{n?}}z +then: +  %4 = sdiv i32 5, %1 +  ret i32 %4 +else: +  %5 = sdiv i32 %1, %0 +  ret i32 %5 +} + +define i32 @test_branch_zero(i32, i32) SLHATTR { +; CHECK-LABEL: test_branch_zero +  %3 = and i32 %0, 16 +  %4 = icmp eq i32 %3, 0 +  br i1 %4, label %then, label %else +;SLH-NOT:   tb{{n?}}z +;NOSLH:     tb{{n?}}z +then: +  %5 = sdiv i32 5, %1 +  ret i32 %5 +else: +  %6 = sdiv i32 %1, %0 +  ret i32 %6 +} + +define i32 @landingpad(i32 %l0, i32 %l1) SLHATTR personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +; CHECK-LABEL: landingpad +entry: +; SLH:  cmp sp, #0 +; SLH:  csetm x16, ne +; NOSLH-NOT:  cmp sp, #0 +; NOSLH-NOT:  csetm x16, ne +; CHECK: bl _Z10throwing_fv +  invoke void @_Z10throwing_fv() +          to label %exit unwind label %lpad +; SLH:  cmp sp, #0 +; SLH:  csetm x16, ne + +lpad: +  %l4 = landingpad { i8*, i32 } +          catch i8* null +; SLH:  cmp sp, #0 +; SLH:  csetm x16, ne +; NOSLH-NOT:  cmp sp, #0 +; NOSLH-NOT:  csetm x16, ne +  %l5 = extractvalue { i8*, i32 } %l4, 0 +  %l6 = tail call i8* @__cxa_begin_catch(i8* %l5) +  %l7 = icmp sgt i32 %l0, %l1 +  br i1 %l7, label %then, label %else +; GlobalISel lowers the branch to a b.ne sometimes instead of b.ge as expected.. +; CHECK: b.[[COND:(le)|(gt)|(ne)]] + +then: +; SLH-DAG: csel x16, x16, xzr, [[COND]] +  %l9 = sdiv i32 %l0, %l1 +  br label %postif + +else: +; SLH-DAG: csel x16, x16, xzr, {{(gt)|(le)|(eq)}} +  %l11 = sdiv i32 %l1, %l0 +  br label %postif + +postif: +  %l13 = phi i32 [ %l9, %then ], [ %l11, %else ] +  tail call void @__cxa_end_catch() +  br label %exit + +exit: +  %l15 = phi i32 [ %l13, %postif ], [ 0, %entry ] +  ret i32 %l15 +} + +declare i32 @__gxx_personality_v0(...) 
+declare void @_Z10throwing_fv() local_unnamed_addr +declare i8* @__cxa_begin_catch(i8*) local_unnamed_addr +declare void @__cxa_end_catch() local_unnamed_addr diff --git a/llvm/test/CodeGen/AArch64/speculation-hardening.mir b/llvm/test/CodeGen/AArch64/speculation-hardening.mir new file mode 100644 index 00000000000..cf8357d9558 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/speculation-hardening.mir @@ -0,0 +1,117 @@ +# RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu \ +# RUN:     -start-before aarch64-speculation-hardening -o - %s \ +# RUN:   | FileCheck %s --dump-input-on-failure + +# Check that the speculation hardening pass generates code as expected for +# basic blocks ending with a variety of branch patterns: +# - (1) no branches (fallthrough) +# - (2) one unconditional branch +# - (3) one conditional branch + fall-through +# - (4) one conditional branch + one unconditional branch +# - other direct branches don't seem to be generated by the AArch64 codegen +--- | +  define void @nobranch_fallthrough(i32 %a, i32 %b) speculative_load_hardening { +   ret void +  } +  define void @uncondbranch(i32 %a, i32 %b) speculative_load_hardening { +   ret void +  } +  define void @condbranch_fallthrough(i32 %a, i32 %b) speculative_load_hardening { +   ret void +  } +  define void @condbranch_uncondbranch(i32 %a, i32 %b) speculative_load_hardening { +   ret void +  } +  define void @indirectbranch(i32 %a, i32 %b) speculative_load_hardening { +   ret void +  } +... +--- +name:            nobranch_fallthrough +tracksRegLiveness: true +body:             | +  ; CHECK-LABEL: nobranch_fallthrough +  bb.0: +    successors: %bb.1 +    liveins: $w0, $w1 +  ; CHECK-NOT: csel +  bb.1: +    liveins: $w0 +   RET undef $lr, implicit $w0 +... +--- +name:            uncondbranch +tracksRegLiveness: true +body:             | +  ; CHECK-LABEL: uncondbranch +  bb.0: +    successors: %bb.1 +    liveins: $w0, $w1 +    B %bb.1 +  ; CHECK-NOT: csel +  bb.1: +   liveins: $w0 +   RET undef $lr, implicit $w0 +... +--- +name:            condbranch_fallthrough +tracksRegLiveness: true +body:             | +  ; CHECK-LABEL: condbranch_fallthrough +  bb.0: +    successors: %bb.1, %bb.2 +    liveins: $w0, $w1 +    $wzr = SUBSWrs renamable $w0, renamable $w1, 0, implicit-def $nzcv, implicit-def $nzcv +    Bcc 11, %bb.2, implicit $nzcv +  ; CHECK: b.lt [[BB_LT_T:\.LBB[0-9_]+]] + +  bb.1: +    liveins: $nzcv, $w0 +  ; CHECK: csel x16, x16, xzr, ge +    RET undef $lr, implicit $w0 +  bb.2: +    liveins: $nzcv, $w0 +  ; CHECK: csel x16, x16, xzr, lt +    RET undef $lr, implicit $w0 +... +--- +name:            condbranch_uncondbranch +tracksRegLiveness: true +body:             | +  ; CHECK-LABEL: condbranch_uncondbranch +  bb.0: +    successors: %bb.1, %bb.2 +    liveins: $w0, $w1 +    $wzr = SUBSWrs renamable $w0, renamable $w1, 0, implicit-def $nzcv, implicit-def $nzcv +    Bcc 11, %bb.2, implicit $nzcv +    B %bb.1, implicit $nzcv +  ; CHECK: b.lt [[BB_LT_T:\.LBB[0-9_]+]] + +  bb.1: +    liveins: $nzcv, $w0 +  ; CHECK: csel x16, x16, xzr, ge +    RET undef $lr, implicit $w0 +  bb.2: +    liveins: $nzcv, $w0 +  ; CHECK: csel x16, x16, xzr, lt +    RET undef $lr, implicit $w0 +... +--- +name:            indirectbranch +tracksRegLiveness: true +body:             | +  ; Check that no instrumentation is done on indirect branches (for now). 
+  ; CHECK-LABEL: indirectbranch
+  bb.0:
+    successors: %bb.1, %bb.2
+    liveins: $x0
+    BR $x0
+  bb.1:
+   liveins: $x0
+  ; CHECK-NOT: csel
+   RET undef $lr, implicit $x0
+  bb.2:
+   liveins: $x0
+  ; CHECK-NOT: csel
+   RET undef $lr, implicit $x0
+...
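When a function reads or writes X16/W16 itself (detected by functionUsesHardeningRegister above and exercised by speculation-hardening-dagisel.ll), the pass falls back to full control-flow speculation barriers instead of CSEL-based tracking. A simplified, illustrative sketch of that fallback shape, not verbatim compiler output:

    f_using_reserved_reg_x16:   // shape of the dagisel test function, simplified
        dsb   sy                // barrier pair at function entry
        isb
        cmp   x0, x1
        b.ls  .Lcleanup         // conditional branches still consume flags
        dsb   sy                // barrier pair on each outgoing edge of the branch
        isb
        ...
        bl    g
        dsb   sy                // re-established right after each call returns
        isb
        ...
    .Lcleanup:
        dsb   sy                // the other edge of the conditional branch
        isb
        ret

This is considerably more expensive than the CSEL-based tracking, which is why the header comment treats it purely as a fallback for the rare functions that demand X16/W16 through inline assembly.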

