AArch64: Macrofusion: Split features, add missing combinations.

AArch64InstrInfo::shouldScheduleAdjacent() determines whether two instruction can benefit from macroop fusion on apple CPUs. The list turned out to be incomplete: - the "rr" variants of the instructions were missing - even the "rs" variants can have shift value == 0 and behave like the "rr" variants This also splits the MacropFusion target feature into ArithmeticBccFusion and ArithmeticCbzFusion. Differential Revision: https://reviews.llvm.org/D25142 llvm-svn: 283243
author: Matthias Braun <matze@braunis.de> 2016-10-04 19:28:21 +0000
committer: Matthias Braun <matze@braunis.de> 2016-10-04 19:28:21 +0000
commit: 46a5238682fc69457c70cca0e9bb950140062b7d (patch)
tree: 77a796443e9fe292be22195e951528f84ea5363b /llvm/lib/Target
parent: a271d1a531ecfa5c1c71f644fc4aaebfd9ae4fbd (diff)
download: bcm5719-llvm-46a5238682fc69457c70cca0e9bb950140062b7d.tar.gz
bcm5719-llvm-46a5238682fc69457c70cca0e9bb950140062b7d.zip
3 files changed, 59 insertions, 11 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
index 5c66748cee6..2ff3cf45a84 100644
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@@ -94,9 +94,13 @@ def FeatureAlternateSExtLoadCVTF32Pattern : SubtargetFeature<
     "alternate-sextload-cvt-f32-pattern", "UseAlternateSExtLoadCVTF32Pattern",
     "true", "Use alternative pattern for sextload convert to f32">;
 
-def FeatureMacroOpFusion : SubtargetFeature<
-    "macroop-fusion", "HasMacroOpFusion", "true",
-    "CPU supports macro op fusion">;
+def FeatureArithmeticBccFusion : SubtargetFeature<
+    "arith-bcc-fusion", "HasArithmeticBccFusion", "true",
+    "CPU fuses arithmetic+bcc operations">;
+
+def FeatureArithmeticCbzFusion : SubtargetFeature<
+    "arith-cbz-fusion", "HasArithmeticCbzFusion", "true",
+    "CPU fuses arithmetic + cbz/cbnz operations">;
 
 def FeatureDisableLatencySchedHeuristic : SubtargetFeature<
     "disable-latency-sched-heuristic", "DisableLatencySchedHeuristic", "true",
@@ -204,7 +208,8 @@ def ProcCyclone : SubtargetFeature<"cyclone", "ARMProcFamily", "Cyclone",
                                    FeatureCrypto,
                                    FeatureDisableLatencySchedHeuristic,
                                    FeatureFPARMv8,
-                                   FeatureMacroOpFusion,
+                                   FeatureArithmeticBccFusion,
+                                   FeatureArithmeticCbzFusion,
                                    FeatureNEON,
                                    FeaturePerfMon,
                                    FeatureSlowMisaligned128Store,
@@ -244,7 +249,7 @@ def ProcVulcan  : SubtargetFeature<"vulcan", "ARMProcFamily", "Vulcan",
                                    FeatureCRC,
                                    FeatureCrypto,
                                    FeatureFPARMv8,
-                                   FeatureMacroOpFusion,
+                                   FeatureArithmeticBccFusion,
                                    FeatureNEON,
                                    FeaturePostRAScheduler,
                                    FeaturePredictableSelectIsExpensive,
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 859f7828901..b26dbce1875 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -1876,39 +1876,80 @@ bool AArch64InstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt,
 
 bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr &First,
                                               MachineInstr &Second) const {
-  if (Subtarget.hasMacroOpFusion()) {
+  if (Subtarget.hasArithmeticBccFusion()) {
     // Fuse CMN, CMP, TST followed by Bcc.
     unsigned SecondOpcode = Second.getOpcode();
     if (SecondOpcode == AArch64::Bcc) {
       switch (First.getOpcode()) {
       default:
         return false;
-      case AArch64::SUBSWri:
       case AArch64::ADDSWri:
-      case AArch64::ANDSWri:
-      case AArch64::SUBSXri:
+      case AArch64::ADDSWrr:
       case AArch64::ADDSXri:
+      case AArch64::ADDSXrr:
+      case AArch64::ANDSWri:
+      case AArch64::ANDSWrr:
       case AArch64::ANDSXri:
+      case AArch64::ANDSXrr:
+      case AArch64::SUBSWri:
+      case AArch64::SUBSWrr:
+      case AArch64::SUBSXri:
+      case AArch64::SUBSXrr:
+      case AArch64::BICSWrr:
+      case AArch64::BICSXrr:
         return true;
+      case AArch64::ADDSWrs:
+      case AArch64::ADDSXrs:
+      case AArch64::ANDSWrs:
+      case AArch64::ANDSXrs:
+      case AArch64::SUBSWrs:
+      case AArch64::SUBSXrs:
+      case AArch64::BICSWrs:
+      case AArch64::BICSXrs:
+        // Shift value can be 0 making these behave like the "rr" variant...
+        return !hasShiftedReg(Second);
       }
     }
+  }
+  if (Subtarget.hasArithmeticCbzFusion()) {
     // Fuse ALU operations followed by CBZ/CBNZ.
+    unsigned SecondOpcode = Second.getOpcode();
     if (SecondOpcode == AArch64::CBNZW || SecondOpcode == AArch64::CBNZX ||
         SecondOpcode == AArch64::CBZW || SecondOpcode == AArch64::CBZX) {
       switch (First.getOpcode()) {
       default:
         return false;
       case AArch64::ADDWri:
+      case AArch64::ADDWrr:
       case AArch64::ADDXri:
+      case AArch64::ADDXrr:
       case AArch64::ANDWri:
+      case AArch64::ANDWrr:
       case AArch64::ANDXri:
+      case AArch64::ANDXrr:
       case AArch64::EORWri:
+      case AArch64::EORWrr:
       case AArch64::EORXri:
+      case AArch64::EORXrr:
       case AArch64::ORRWri:
+      case AArch64::ORRWrr:
       case AArch64::ORRXri:
+      case AArch64::ORRXrr:
       case AArch64::SUBWri:
+      case AArch64::SUBWrr:
       case AArch64::SUBXri:
+      case AArch64::SUBXrr:
         return true;
+      case AArch64::ADDWrs:
+      case AArch64::ADDXrs:
+      case AArch64::ANDWrs:
+      case AArch64::ANDXrs:
+      case AArch64::SUBWrs:
+      case AArch64::SUBXrs:
+      case AArch64::BICWrs:
+      case AArch64::BICXrs:
+        // Shift value can be 0 making these behave like the "rr" variant...
+        return !hasShiftedReg(Second);
       }
     }
   }
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index 9f51c6be635..a21dbd8322f 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -80,7 +80,8 @@ protected:
   bool Misaligned128StoreIsSlow = false;
   bool AvoidQuadLdStPairs = false;
   bool UseAlternateSExtLoadCVTF32Pattern = false;
-  bool HasMacroOpFusion = false;
+  bool HasArithmeticBccFusion = false;
+  bool HasArithmeticCbzFusion = false;
   bool DisableLatencySchedHeuristic = false;
   uint8_t MaxInterleaveFactor = 2;
   uint8_t VectorInsertExtractBaseCost = 3;
@@ -188,7 +189,8 @@ public:
   bool useAlternateSExtLoadCVTF32Pattern() const {
     return UseAlternateSExtLoadCVTF32Pattern;
   }
-  bool hasMacroOpFusion() const { return HasMacroOpFusion; }
+  bool hasArithmeticBccFusion() const { return HasArithmeticBccFusion; }
+  bool hasArithmeticCbzFusion() const { return HasArithmeticCbzFusion; }
   unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; }
   unsigned getVectorInsertExtractBaseCost() const {
     return VectorInsertExtractBaseCost;
author	Matthias Braun <matze@braunis.de>	2016-10-04 19:28:21 +0000
committer	Matthias Braun <matze@braunis.de>	2016-10-04 19:28:21 +0000
commit	46a5238682fc69457c70cca0e9bb950140062b7d (patch)
tree	77a796443e9fe292be22195e951528f84ea5363b /llvm/lib/Target
parent	a271d1a531ecfa5c1c71f644fc4aaebfd9ae4fbd (diff)
download	bcm5719-llvm-46a5238682fc69457c70cca0e9bb950140062b7d.tar.gz bcm5719-llvm-46a5238682fc69457c70cca0e9bb950140062b7d.zip