3 files changed, 225 insertions, 43 deletions
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 10faeb75b33..f7443c8577a 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -650,50 +650,50 @@ int ARMTTIImpl::getArithmeticInstrCost(
   int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
 
-  const unsigned FunctionCallDivCost = 20;
-  const unsigned ReciprocalDivCost = 10;
-  static const CostTblEntry CostTbl[] = {
-    // Division.
-    // These costs are somewhat random. Choose a cost of 20 to indicate that
-    // vectorizing devision (added function call) is going to be very expensive.
-    // Double registers types.
-    { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
-    { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
-    { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
-    { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
-    { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
-    { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
-    { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
-    { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
-    { ISD::SDIV, MVT::v4i16,     ReciprocalDivCost},
-    { ISD::UDIV, MVT::v4i16,     ReciprocalDivCost},
-    { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
-    { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
-    { ISD::SDIV, MVT::v8i8,      ReciprocalDivCost},
-    { ISD::UDIV, MVT::v8i8,      ReciprocalDivCost},
-    { ISD::SREM, MVT::v8i8,  8 * FunctionCallDivCost},
-    { ISD::UREM, MVT::v8i8,  8 * FunctionCallDivCost},
-    // Quad register types.
-    { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
-    { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
-    { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
-    { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
-    { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
-    { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
-    { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
-    { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
-    { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
-    { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
-    { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
-    { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
-    { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
-    { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
-    { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
-    { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
-    // Multiplication.
-  };
-
   if (ST->hasNEON()) {
+    const unsigned FunctionCallDivCost = 20;
+    const unsigned ReciprocalDivCost = 10;
+    static const CostTblEntry CostTbl[] = {
+      // Division and remainder, keyed by ISD opcode and legalized vector type.
+      // These costs are somewhat random. Choose a cost of 20 to indicate that
+      // vectorizing division (which adds a function call) is very expensive.
+      // Double register types.
+      { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
+      { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
+      { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
+      { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
+      { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
+      { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
+      { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
+      { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
+      { ISD::SDIV, MVT::v4i16,     ReciprocalDivCost},
+      { ISD::UDIV, MVT::v4i16,     ReciprocalDivCost},
+      { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
+      { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
+      { ISD::SDIV, MVT::v8i8,      ReciprocalDivCost},
+      { ISD::UDIV, MVT::v8i8,      ReciprocalDivCost},
+      { ISD::SREM, MVT::v8i8,  8 * FunctionCallDivCost},
+      { ISD::UREM, MVT::v8i8,  8 * FunctionCallDivCost},
+      // Quad register types.
+      { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
+      { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
+      { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
+      { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
+      { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
+      { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
+      { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
+      { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
+      { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
+      { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
+      { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
+      { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
+      { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
+      { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
+      { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
+      { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
+      // Multiplication.
+    };
+
     if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
       return LT.first * Entry->Cost;
 
diff --git a/llvm/test/Analysis/CostModel/ARM/freeshift.ll b/llvm/test/Analysis/CostModel/ARM/freeshift.ll
new file mode 100644
index 00000000000..8e48a7f7cf4
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/ARM/freeshift.ll
@@ -0,0 +1,96 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt -cost-model -analyze -mtriple=thumbv8.1m.main-none-eabi < %s | FileCheck %s
+
+define void @shl(i32 %a, i32 %b) { ; shl by 3 feeding one add/sub/xor/and/or/icmp each; all costed at 1.
+; CHECK-LABEL: 'shl'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %as = shl i32 %a, 3
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ac = add i32 %b, %as
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ss = shl i32 %a, 3
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sc = sub i32 %b, %ss
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %xs = shl i32 %a, 3
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %xc = xor i32 %b, %xs
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ns = shl i32 %a, 3
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nc = and i32 %b, %ns
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %os = shl i32 %a, 3
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %oc = or i32 %b, %os
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %is = shl i32 %a, 3
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ic = icmp eq i32 %b, %is
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %as = shl i32 %a, 3
+  %ac = add i32 %b, %as
+  %ss = shl i32 %a, 3
+  %sc = sub i32 %b, %ss
+  %xs = shl i32 %a, 3
+  %xc = xor i32 %b, %xs
+  %ns = shl i32 %a, 3
+  %nc = and i32 %b, %ns
+  %os = shl i32 %a, 3
+  %oc = or i32 %b, %os
+  %is = shl i32 %a, 3
+  %ic = icmp eq i32 %b, %is
+  ret void
+}
+
+define void @ashr(i32 %a, i32 %b) { ; ashr by 3 feeding one add/sub/xor/and/or/icmp each; all costed at 1.
+; CHECK-LABEL: 'ashr'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %as = ashr i32 %a, 3
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ac = add i32 %b, %as
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ss = ashr i32 %a, 3
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sc = sub i32 %b, %ss
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %xs = ashr i32 %a, 3
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %xc = xor i32 %b, %xs
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ns = ashr i32 %a, 3
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nc = and i32 %b, %ns
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %os = ashr i32 %a, 3
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %oc = or i32 %b, %os
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %is = ashr i32 %a, 3
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ic = icmp eq i32 %b, %is
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %as = ashr i32 %a, 3
+  %ac = add i32 %b, %as
+  %ss = ashr i32 %a, 3
+  %sc = sub i32 %b, %ss
+  %xs = ashr i32 %a, 3
+  %xc = xor i32 %b, %xs
+  %ns = ashr i32 %a, 3
+  %nc = and i32 %b, %ns
+  %os = ashr i32 %a, 3
+  %oc = or i32 %b, %os
+  %is = ashr i32 %a, 3
+  %ic = icmp eq i32 %b, %is
+  ret void
+}
+
+define void @lshr(i32 %a, i32 %b) { ; lshr by 3 feeding one add/sub/xor/and/or/icmp each; all costed at 1.
+; CHECK-LABEL: 'lshr'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %as = lshr i32 %a, 3
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ac = add i32 %b, %as
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ss = lshr i32 %a, 3
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %sc = sub i32 %b, %ss
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %xs = lshr i32 %a, 3
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %xc = xor i32 %b, %xs
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ns = lshr i32 %a, 3
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nc = and i32 %b, %ns
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %os = lshr i32 %a, 3
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %oc = or i32 %b, %os
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %is = lshr i32 %a, 3
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ic = icmp eq i32 %b, %is
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %as = lshr i32 %a, 3
+  %ac = add i32 %b, %as
+  %ss = lshr i32 %a, 3
+  %sc = sub i32 %b, %ss
+  %xs = lshr i32 %a, 3
+  %xc = xor i32 %b, %xs
+  %ns = lshr i32 %a, 3
+  %nc = and i32 %b, %ns
+  %os = lshr i32 %a, 3
+  %oc = or i32 %b, %os
+  %is = lshr i32 %a, 3
+  %ic = icmp eq i32 %b, %is
+  ret void
+}
+
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-shiftcost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-shiftcost.ll
new file mode 100644
index 00000000000..858d3d88099
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-shiftcost.ll
@@ -0,0 +1,86 @@
+; RUN: opt -loop-vectorize -enable-arm-maskedldst < %s -S -o - | FileCheck %s --check-prefix=CHECK
+; RUN: opt -loop-vectorize -enable-arm-maskedldst -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=CHECK-COST
+; REQUIRES: asserts
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv8.1m.main-arm-none-eabi"
+
+; CHECK-LABEL: test
+; CHECK-COST: LV: Found an estimated cost of 1 for VF 1 For instruction:   %and515 = shl i32 %l41, 3
+; CHECK-COST: LV: Found an estimated cost of 1 for VF 1 For instruction:   %l45 = and i32 %and515, 131072
+; CHECK-COST: LV: Found an estimated cost of 2 for VF 4 For instruction:   %and515 = shl i32 %l41, 3
+; CHECK-COST: LV: Found an estimated cost of 2 for VF 4 For instruction:   %l45 = and i32 %and515, 131072
+; CHECK: vector.body
+
+define void @test([101 x i32] *%src, i32 %N) #0 { ; per element: if the low 16 bits are non-zero, OR in shifted-and-masked copies and store back.
+entry:
+  br label %for.body386
+
+for.body386:                                      ; preds = %entry, %l77
+  %add387 = phi i32 [ %inc532, %l77 ], [ 0, %entry ]
+  %arrayidx388 = getelementptr inbounds [101 x i32], [101 x i32]* %src, i32 0, i32 %add387
+  %l41 = load i32, i32* %arrayidx388, align 4
+  %l42 = and i32 %l41, 65535
+  %l43 = icmp eq i32 %l42, 0
+  br i1 %l43, label %l77, label %l44
+
+l44:                                               ; preds = %for.body386
+  %and515 = shl i32 %l41, 3
+  %l45 = and i32 %and515, 131072
+  %and506 = shl i32 %l41, 5
+  %l46 = and i32 %and506, 262144
+  %and497 = shl i32 %l41, 7
+  %l47 = and i32 %and497, 524288
+  %and488 = shl i32 %l41, 9
+  %l48 = and i32 %and488, 1048576
+  %and479 = shl i32 %l41, 11
+  %l49 = and i32 %and479, 2097152
+  %and470 = shl i32 %l41, 13
+  %l50 = and i32 %and470, 4194304
+  %and461 = shl i32 %l41, 15
+  %l51 = and i32 %and461, 8388608
+  %and452 = shl i32 %l41, 17
+  %l52 = and i32 %and452, 16777216
+  %and443 = shl i32 %l41, 19
+  %l53 = and i32 %and443, 33554432
+  %and434 = shl i32 %l41, 21
+  %l54 = and i32 %and434, 67108864
+  %and425 = shl i32 %l41, 23
+  %l55 = and i32 %and425, 134217728
+  %and416 = shl i32 %l41, 25
+  %l56 = and i32 %and416, 268435456
+  %and407 = shl i32 %l41, 27
+  %l57 = and i32 %and407, 536870912
+  %and398 = shl i32 %l41, 29
+  %l58 = and i32 %and398, 1073741824
+  %l59 = shl i32 %l41, 31
+  %l60 = or i32 %l59, %l41
+  %l61 = or i32 %l58, %l60
+  %l62 = or i32 %l57, %l61
+  %l63 = or i32 %l56, %l62
+  %l64 = or i32 %l55, %l63
+  %l65 = or i32 %l54, %l64
+  %l66 = or i32 %l53, %l65
+  %l67 = or i32 %l52, %l66
+  %l68 = or i32 %l51, %l67
+  %l69 = or i32 %l50, %l68
+  %l70 = or i32 %l49, %l69
+  %l71 = or i32 %l48, %l70
+  %l72 = or i32 %l47, %l71
+  %l73 = or i32 %l46, %l72
+  %l74 = or i32 %l45, %l73
+  %and524 = shl i32 %l41, 1
+  %l75 = and i32 %and524, 65536
+  %l76 = or i32 %l75, %l74
+  store i32 %l76, i32* %arrayidx388, align 4
+  br label %l77
+
+l77:                                               ; preds = %for.body386, %l44
+  %inc532 = add nuw nsw i32 %add387, 1
+  %exitcond649 = icmp eq i32 %inc532, %N
+  br i1 %exitcond649, label %exit, label %for.body386
+
+exit:
+  ret void
+}
+
+attributes #0 = { nounwind "min-legal-vector-width"="0" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+fp-armv8d16sp,+fp16,+fpregs,+fullfp16,+hwdiv,+lob,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2sp,+vfp3d16sp,+vfp4d16sp" "use-soft-float"="false" }