author    Tim Northover <tnorthover@apple.com>  2014-05-30 10:09:59 +0000
committer Tim Northover <tnorthover@apple.com>  2014-05-30 10:09:59 +0000
commit    b4ddc0845ab5260023e9afa3f7bc71a0bc731ae6 (patch)
tree      37a8539f9db4739dd57e7bdf39433594e822e2ec /llvm/lib
parent    5070c18928e4b4855202ee327f537c7a1969051e (diff)
download  bcm5719-llvm-b4ddc0845ab5260023e9afa3f7bc71a0bc731ae6.tar.gz
          bcm5719-llvm-b4ddc0845ab5260023e9afa3f7bc71a0bc731ae6.zip
ARM & AArch64: make use of common cmpxchg idioms after expansion
The C and C++ semantics for compare_exchange require it to return a bool indicating success. This gets mapped to LLVM IR that follows each cmpxchg with an icmp of the value loaded against the desired value. When lowered to ldxr/stxr loops, this extra comparison is redundant: its result is implicit in the control flow of the function.

This commit makes two changes: it replaces that icmp with appropriate PHI nodes, and then makes sure SimplifyCFG is run after expansion to actually make use of the opportunities revealed.

I've also added -{arm,aarch64}-atomic-cfg-tidy options so that existing fragile tests aren't perturbed too much by the change. Many of them either rely on undef/unreachable too pervasively to be restored to something well-defined (particularly while making sure they test the same obscure assert from many years ago), or depend on a particular CFG shape, which is disrupted by SimplifyCFG.

rdar://problem/16227836

llvm-svn: 209883
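For reference, a minimal sketch (not part of the commit) of the IR shape being discussed: front ends lower compare_exchange to a cmpxchg followed by an icmp that recomputes the success bit. Value names, types and orderings below are illustrative placeholders.

    %loaded  = cmpxchg i32* %addr, i32 %desired, i32 %newval seq_cst seq_cst
    %success = icmp eq i32 %loaded, %desired   ; redundant once this becomes an ldxr/stxr loop
    br i1 %success, label %done, label %retry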
Diffstat (limited to 'llvm/lib')
-rw-r--r-- llvm/lib/CodeGen/AtomicExpandLoadLinkedPass.cpp  | 42
-rw-r--r-- llvm/lib/Target/AArch64/AArch64TargetMachine.cpp | 25
-rw-r--r-- llvm/lib/Target/ARM/ARMTargetMachine.cpp         | 22
3 files changed, 81 insertions(+), 8 deletions(-)
diff --git a/llvm/lib/CodeGen/AtomicExpandLoadLinkedPass.cpp b/llvm/lib/CodeGen/AtomicExpandLoadLinkedPass.cpp
index d995333971b..d6d9907be7e 100644
--- a/llvm/lib/CodeGen/AtomicExpandLoadLinkedPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandLoadLinkedPass.cpp
@@ -300,12 +300,50 @@ bool AtomicExpandLoadLinked::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
StoreSuccess, ConstantInt::get(Type::getInt32Ty(Ctx), 0), "success");
Builder.CreateCondBr(TryAgain, LoopBB, BarrierBB);
- // Finally, make sure later instructions don't get reordered with a fence if
- // necessary.
+ // Make sure later instructions don't get reordered with a fence if necessary.
Builder.SetInsertPoint(BarrierBB);
insertTrailingFence(Builder, SuccessOrder);
Builder.CreateBr(ExitBB);
+ // Finally, we have control-flow based knowledge of whether the cmpxchg
+ // succeeded or not. We expose this to later passes by converting any
+ // subsequent "icmp eq/ne %loaded, %oldval" into a use of an appropriate PHI.
+
+ // Setup the builder so we can create any PHIs we need.
+ Builder.SetInsertPoint(FailureBB, FailureBB->begin());
+ BasicBlock *SuccessBB = FailureOrder == Monotonic ? BarrierBB : TryStoreBB;
+ PHINode *Success = 0, *Failure = 0;
+
+ // Look for any users of the cmpxchg that are just comparing the loaded value
+ // against the desired one, and replace them with the CFG-derived version.
+ for (auto User : CI->users()) {
+ ICmpInst *ICmp = dyn_cast<ICmpInst>(User);
+ if (!ICmp)
+ continue;
+
+ // Because we know ICmp uses CI, we only need one operand to be the old
+ // value.
+ if (ICmp->getOperand(0) != CI->getCompareOperand() &&
+ ICmp->getOperand(1) != CI->getCompareOperand())
+ continue;
+
+ if (ICmp->getPredicate() == CmpInst::ICMP_EQ) {
+ if (!Success) {
+ Success = Builder.CreatePHI(Type::getInt1Ty(Ctx), 2);
+ Success->addIncoming(ConstantInt::getTrue(Ctx), SuccessBB);
+ Success->addIncoming(ConstantInt::getFalse(Ctx), LoopBB);
+ }
+ ICmp->replaceAllUsesWith(Success);
+ } else if (ICmp->getPredicate() == CmpInst::ICMP_NE) {
+ if (!Failure) {
+ Failure = Builder.CreatePHI(Type::getInt1Ty(Ctx), 2);
+ Failure->addIncoming(ConstantInt::getFalse(Ctx), SuccessBB);
+ Failure->addIncoming(ConstantInt::getTrue(Ctx), LoopBB);
+ }
+ ICmp->replaceAllUsesWith(Failure);
+ }
+ }
+
CI->replaceAllUsesWith(Loaded);
CI->eraseFromParent();
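In effect (a sketch reusing the illustrative names from above, not taken from the patch), any icmp eq/ne of the loaded value against the compare operand now has its users redirected to a PHI whose value is determined purely by which edge the ldxr/stxr expansion took; the block names here are placeholders:

    ; before: recompute the success bit
    %success = icmp eq i32 %loaded, %desired
    ; after: read it off the control flow (true only on the store-conditional-succeeded path)
    %success = phi i1 [ true, %cmpxchg.success ], [ false, %cmpxchg.failure ]

The SimplifyCFG run added in the target pass configs below can then fold the now-trivial branches that consumed the old icmp.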
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index 0b5dd2f067e..ba301e95538 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -53,6 +53,12 @@ static cl::opt<bool>
EnableLoadStoreOpt("aarch64-load-store-opt", cl::desc("Enable the load/store pair"
" optimization pass"), cl::init(true), cl::Hidden);
+static cl::opt<bool>
+EnableAtomicTidy("aarch64-atomic-cfg-tidy", cl::Hidden,
+ cl::desc("Run SimplifyCFG after expanding atomic operations"
+ " to make use of cmpxchg flow-based information"),
+ cl::init(true));
+
extern "C" void LLVMInitializeAArch64Target() {
// Register the target.
RegisterTargetMachine<AArch64leTargetMachine> X(TheAArch64leTarget);
@@ -113,6 +119,7 @@ public:
return getTM<AArch64TargetMachine>();
}
+ void addIRPasses() override;
bool addPreISel() override;
bool addInstSelector() override;
bool addILPOpts() override;
@@ -135,6 +142,20 @@ TargetPassConfig *AArch64TargetMachine::createPassConfig(PassManagerBase &PM) {
return new AArch64PassConfig(this, PM);
}
+void AArch64PassConfig::addIRPasses() {
+ // Always expand atomic operations, we don't deal with atomicrmw or cmpxchg
+ // ourselves.
+ addPass(createAtomicExpandLoadLinkedPass(TM));
+
+ // Cmpxchg instructions are often used with a subsequent comparison to
+ // determine whether it succeeded. We can exploit existing control-flow in
+ // ldrex/strex loops to simplify this, but it needs tidying up.
+ if (TM->getOptLevel() != CodeGenOpt::None && EnableAtomicTidy)
+ addPass(createCFGSimplificationPass());
+
+ TargetPassConfig::addIRPasses();
+}
+
// Pass Pipeline Configuration
bool AArch64PassConfig::addPreISel() {
// Run promote constant before global merge, so that the promoted constants
@@ -146,10 +167,6 @@ bool AArch64PassConfig::addPreISel() {
if (TM->getOptLevel() != CodeGenOpt::None)
addPass(createAArch64AddressTypePromotionPass());
- // Always expand atomic operations, we don't deal with atomicrmw or cmpxchg
- // ourselves.
- addPass(createAtomicExpandLoadLinkedPass(TM));
-
return false;
}
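As a usage note (an assumption on my part, not shown in the commit): since the new option is an ordinary cl::opt, a fragile test can opt out of the extra SimplifyCFG run from the llc command line, for example (triple and file name are placeholders):

    llc -mtriple=arm64-apple-ios -aarch64-atomic-cfg-tidy=false fragile-test.ll

The ARM diff below adds the analogous arm-atomic-cfg-tidy option.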
diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
index 887622705ed..6ef2ea4e103 100644
--- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
@@ -28,6 +28,12 @@ DisableA15SDOptimization("disable-a15-sd-optimization", cl::Hidden,
cl::desc("Inhibit optimization of S->D register accesses on A15"),
cl::init(false));
+static cl::opt<bool>
+EnableAtomicTidy("arm-atomic-cfg-tidy", cl::Hidden,
+ cl::desc("Run SimplifyCFG after expanding atomic operations"
+ " to make use of cmpxchg flow-based information"),
+ cl::init(true));
+
extern "C" void LLVMInitializeARMTarget() {
// Register the target.
RegisterTargetMachine<ARMLETargetMachine> X(TheARMLETarget);
@@ -213,6 +219,7 @@ public:
return *getARMTargetMachine().getSubtargetImpl();
}
+ void addIRPasses() override;
bool addPreISel() override;
bool addInstSelector() override;
bool addPreRegAlloc() override;
@@ -225,11 +232,22 @@ TargetPassConfig *ARMBaseTargetMachine::createPassConfig(PassManagerBase &PM) {
return new ARMPassConfig(this, PM);
}
-bool ARMPassConfig::addPreISel() {
+void ARMPassConfig::addIRPasses() {
const ARMSubtarget *Subtarget = &getARMSubtarget();
- if (Subtarget->hasAnyDataBarrier() && !Subtarget->isThumb1Only())
+ if (Subtarget->hasAnyDataBarrier() && !Subtarget->isThumb1Only()) {
addPass(createAtomicExpandLoadLinkedPass(TM));
+ // Cmpxchg instructions are often used with a subsequent comparison to
+ // determine whether it succeeded. We can exploit existing control-flow in
+ // ldrex/strex loops to simplify this, but it needs tidying up.
+ if (TM->getOptLevel() != CodeGenOpt::None && EnableAtomicTidy)
+ addPass(createCFGSimplificationPass());
+ }
+
+ TargetPassConfig::addIRPasses();
+}
+
+bool ARMPassConfig::addPreISel() {
if (TM->getOptLevel() != CodeGenOpt::None)
addPass(createGlobalMergePass(TM));