3 files changed, 69 insertions, 1 deletions
diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
index a7871ea8db5..b5605c08dc4 100644
--- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -2820,6 +2820,45 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
     if (tryV6T2BitfieldExtractOp(N, false))
       return;
 
+    // If an immediate is used in an AND node, it is possible that the immediate
+    // can be more optimally materialized when negated. If this is the case we
+    // can negate the immediate and use a BIC instead.
+    auto *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1));
+    if (N1C && N1C->hasOneUse() && Subtarget->isThumb()) {
+      uint32_t Imm = static_cast<uint32_t>(N1C->getZExtValue());
+
+      // In Thumb2 mode, an AND can take a 12-bit immediate. If this
+      // immediate can be negated and fit in the immediate operand of
+      // a t2BIC, don't do any manual transform here as this can be
+      // handled by the generic ISel machinery.
+      bool PreferImmediateEncoding =
+          Subtarget->hasThumb2() && !is_t2_so_imm(Imm) && is_t2_so_imm_not(Imm);
+      if (!PreferImmediateEncoding &&
+          ConstantMaterializationCost(Imm) >
+              ConstantMaterializationCost(~Imm)) {
+        // The negated immediate is cheaper to materialize, so use a BIC on it.
+        // NewImm must be a regular (non-target) constant so that it is itself
+        // selected and materialized into the register operand of the BIC.
+        SDValue NewImm = CurDAG->getConstant(~Imm, dl, MVT::i32);
+        // Only position NewImm before N if it was newly created (id == -1).
+        if (NewImm->getNodeId() == -1)
+          CurDAG->RepositionNode(N->getIterator(), NewImm.getNode());
+
+        if (!Subtarget->hasThumb2()) {
+          SDValue Ops[] = {CurDAG->getRegister(ARM::CPSR, MVT::i32),
+                           N->getOperand(0), NewImm, getAL(CurDAG, dl),
+                           CurDAG->getRegister(0, MVT::i32)};
+          ReplaceNode(N, CurDAG->getMachineNode(ARM::tBIC, dl, MVT::i32, Ops));
+          return;
+        }
+        SDValue Ops[] = {N->getOperand(0), NewImm, getAL(CurDAG, dl),
+                         CurDAG->getRegister(0, MVT::i32),
+                         CurDAG->getRegister(0, MVT::i32)};
+        ReplaceNode(N, CurDAG->getMachineNode(ARM::t2BICrr, dl, MVT::i32, Ops));
+        return;
+      }
+    }
+
     // (and (or x, c2), c1) and top 16-bits of c1 and c2 match, lower 16-bits
     // of c1 are 0xffff, and lower 16-bit of c2 are 0. That is, the top 16-bits
     // are entirely contributed by c2 and lower 16-bits are entirely contributed
@@ -2834,7 +2873,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
     if (!Opc)
       break;
     SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
-    ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+    N1C = dyn_cast<ConstantSDNode>(N1);
     if (!N1C)
       break;
     if (N0.getOpcode() == ISD::OR && N0.getNode()->hasOneUse()) {
diff --git a/llvm/test/CodeGen/Thumb/bic_imm.ll b/llvm/test/CodeGen/Thumb/bic_imm.ll
new file mode 100644
index 00000000000..078c321b781
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb/bic_imm.ll
@@ -0,0 +1,12 @@
+; RUN: llc < %s -mtriple=thumbv7-linux-gnueabi -mcpu=cortex-m0 -verify-machineinstrs | FileCheck --check-prefix CHECK-T1 %s
+; RUN: llc < %s -mtriple=thumbv7-linux-gnueabi -mcpu=cortex-m3 -verify-machineinstrs | FileCheck --check-prefix CHECK-T2 %s
+
+; CHECK-T1-LABEL: @i
+; CHECK-T2-LABEL: @i
+; CHECK-T1: bics r0, r{{[0-9]+}}
+; CHECK-T2: bics r0, r{{[0-9]+}}
+define i32 @i(i32 %a) {
+entry:
+  %and = and i32 %a, -276
+  ret i32 %and
+}
diff --git a/llvm/test/CodeGen/Thumb2/bicbfi.ll b/llvm/test/CodeGen/Thumb2/bicbfi.ll
new file mode 100644
index 00000000000..fcdb1225db5
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/bicbfi.ll
@@ -0,0 +1,17 @@
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i1:8:32-i8:8:32-i16:16:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv7--linux-gnueabihf"
+
+; CHECK-LABEL: f:
+; CHECK: bic
+define void @f(i32* nocapture %b, i32* nocapture %c, i32 %a) {
+  %1 = and i32 %a, -4096
+  store i32 %1, i32* %c, align 4
+  %2 = and i32 %a, 4095
+  %3 = or i32 %2, 4096
+  %4 = load i32, i32* %b, align 4
+  %5 = add nsw i32 %4, %3
+  store i32 %5, i32* %b, align 4
+  ret void
+}