diff options
| -rw-r--r-- | llvm/lib/Target/X86/X86.td | 10 | ||||
| -rw-r--r-- | llvm/lib/Target/X86/X86MacroFusion.cpp | 151 | ||||
| -rw-r--r-- | llvm/lib/Target/X86/X86Subtarget.h | 4 | ||||
| -rw-r--r-- | llvm/lib/Target/X86/X86TargetTransformInfo.cpp | 2 | ||||
| -rw-r--r-- | llvm/lib/Target/X86/X86TargetTransformInfo.h | 1 | ||||
| -rw-r--r-- | llvm/test/CodeGen/X86/testb-je-fusion.ll | 69 | ||||
| -rw-r--r-- | llvm/test/Transforms/LoopStrengthReduce/X86/macro-fuse-cmp.ll | 5 | 
7 files changed, 161 insertions, 81 deletions
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index fa8dd8a59f0..c054379acf7 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -344,6 +344,12 @@ def FeatureERMSB            "ermsb", "HasERMSB", "true",            "REP MOVS/STOS are fast">; +// Bulldozer and newer processors can merge CMP/TEST (but not other +// instructions) with conditional branches. +def FeatureBranchFusion +    : SubtargetFeature<"branchfusion", "HasBranchFusion", "true", +                 "CMP/TEST can be fused with conditional branches">; +  // Sandy Bridge and newer processors have many instructions that can be  // fused with conditional branches and pass through the CPU as a single  // operation. @@ -810,7 +816,7 @@ def ProcessorFeatures {                                                        FeatureSlowSHLD,                                                        FeatureLAHFSAHF,                                                        FeatureFast11ByteNOP, -                                                      FeatureMacroFusion]; +                                                      FeatureBranchFusion];    list<SubtargetFeature> BdVer1Features = BdVer1InheritableFeatures;    // PileDriver @@ -860,7 +866,7 @@ def ProcessorFeatures {                                         FeatureLZCNT,                                         FeatureFastBEXTR,                                         FeatureFast15ByteNOP, -                                       FeatureMacroFusion, +                                       FeatureBranchFusion,                                         FeatureMMX,                                         FeatureMOVBE,                                         FeatureMWAITX, diff --git a/llvm/lib/Target/X86/X86MacroFusion.cpp b/llvm/lib/Target/X86/X86MacroFusion.cpp index e5e80a2339a..a5186562229 100644 --- a/llvm/lib/Target/X86/X86MacroFusion.cpp +++ b/llvm/lib/Target/X86/X86MacroFusion.cpp @@ -18,59 +18,29 @@  using namespace llvm; -/// Check if the instr pair, FirstMI and SecondMI, should be fused -/// together. Given SecondMI, when FirstMI is unspecified, then check if -/// SecondMI may be part of a fused pair at all. -static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, -                                   const TargetSubtargetInfo &TSI, -                                   const MachineInstr *FirstMI, -                                   const MachineInstr &SecondMI) { -  const X86Subtarget &ST = static_cast<const X86Subtarget&>(TSI); -  // Check if this processor supports macro-fusion. -  if (!ST.hasMacroFusion()) -    return false; +namespace { -  enum { -    FuseTest, -    FuseCmp, -    FuseInc -  } FuseKind; +// The classification for the first instruction. +enum class FirstInstrKind { Test, Cmp, And, ALU, IncDec, Invalid }; -  unsigned FirstOpcode = FirstMI -                         ? FirstMI->getOpcode() -                         : static_cast<unsigned>(X86::INSTRUCTION_LIST_END); -  unsigned SecondOpcode = SecondMI.getOpcode(); +// The classification for the second instruction (jump). +enum class JumpKind { +  // JE, JL, JG and variants. +  ELG, +  // JA, JB and variants. +  AB, +  // JS, JP, JO and variants. +  SPO, +  // Not a fusable jump. +  Invalid, +}; -  switch (SecondOpcode) { -  default: -    return false; -  case X86::JE_1: -  case X86::JNE_1: -  case X86::JL_1: -  case X86::JLE_1: -  case X86::JG_1: -  case X86::JGE_1: -    FuseKind = FuseInc; -    break; -  case X86::JB_1: -  case X86::JBE_1: -  case X86::JA_1: -  case X86::JAE_1: -    FuseKind = FuseCmp; -    break; -  case X86::JS_1: -  case X86::JNS_1: -  case X86::JP_1: -  case X86::JNP_1: -  case X86::JO_1: -  case X86::JNO_1: -    FuseKind = FuseTest; -    break; -  } +} // namespace -  switch (FirstOpcode) { +static FirstInstrKind classifyFirst(const MachineInstr &MI) { +  switch (MI.getOpcode()) {    default: -    return false; +    return FirstInstrKind::Invalid;    case X86::TEST8rr:    case X86::TEST16rr:    case X86::TEST32rr: @@ -83,6 +53,7 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,    case X86::TEST16mr:    case X86::TEST32mr:    case X86::TEST64mr: +    return FirstInstrKind::Test;    case X86::AND16ri:    case X86::AND16ri8:    case X86::AND16rm: @@ -98,7 +69,7 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,    case X86::AND8ri:    case X86::AND8rm:    case X86::AND8rr: -    return true; +    return FirstInstrKind::And;    case X86::CMP16ri:    case X86::CMP16ri8:    case X86::CMP16rm: @@ -118,6 +89,7 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,    case X86::CMP8rm:    case X86::CMP8rr:    case X86::CMP8mr: +    return FirstInstrKind::Cmp;    case X86::ADD16ri:    case X86::ADD16ri8:    case X86::ADD16ri8_DB: @@ -159,7 +131,7 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,    case X86::SUB8ri:    case X86::SUB8rm:    case X86::SUB8rr: -    return FuseKind == FuseCmp || FuseKind == FuseInc; +    return FirstInstrKind::ALU;    case X86::INC16r:    case X86::INC32r:    case X86::INC64r: @@ -168,10 +140,83 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,    case X86::DEC32r:    case X86::DEC64r:    case X86::DEC8r: -    return FuseKind == FuseInc; -  case X86::INSTRUCTION_LIST_END: -    return true; +    return FirstInstrKind::IncDec; +  } +} + +static JumpKind classifySecond(const MachineInstr &MI) { +  switch (MI.getOpcode()) { +  default: +    return JumpKind::Invalid; +  case X86::JE_1: +  case X86::JNE_1: +  case X86::JL_1: +  case X86::JLE_1: +  case X86::JG_1: +  case X86::JGE_1: +    return JumpKind::ELG; +  case X86::JB_1: +  case X86::JBE_1: +  case X86::JA_1: +  case X86::JAE_1: +    return JumpKind::AB; +  case X86::JS_1: +  case X86::JNS_1: +  case X86::JP_1: +  case X86::JNP_1: +  case X86::JO_1: +  case X86::JNO_1: +    return JumpKind::SPO; +  } +} + +/// Check if the instr pair, FirstMI and SecondMI, should be fused +/// together. Given SecondMI, when FirstMI is unspecified, then check if +/// SecondMI may be part of a fused pair at all. +static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, +                                   const TargetSubtargetInfo &TSI, +                                   const MachineInstr *FirstMI, +                                   const MachineInstr &SecondMI) { +  const X86Subtarget &ST = static_cast<const X86Subtarget &>(TSI); + +  // Check if this processor supports any kind of fusion. +  if (!(ST.hasBranchFusion() || ST.hasMacroFusion())) +    return false; + +  const JumpKind BranchKind = classifySecond(SecondMI); + +  if (BranchKind == JumpKind::Invalid) +    return false; // Second cannot be fused with anything. + +  if (FirstMI == nullptr) +    return true; // We're only checking whether Second can be fused at all. + +  const FirstInstrKind TestKind = classifyFirst(*FirstMI); + +  if (ST.hasBranchFusion()) { +    // Branch fusion can merge CMP and TEST with all conditional jumps. +    return (TestKind == FirstInstrKind::Cmp || +            TestKind == FirstInstrKind::Test); +  } + +  if (ST.hasMacroFusion()) { +    // Macro Fusion rules are a bit more complex. See Agner Fog's +    // Microarchitecture table 9.2 "Instruction Fusion". +    switch (TestKind) { +    case FirstInstrKind::Test: +    case FirstInstrKind::And: +      return true; +    case FirstInstrKind::Cmp: +    case FirstInstrKind::ALU: +      return BranchKind == JumpKind::ELG || BranchKind == JumpKind::AB; +    case FirstInstrKind::IncDec: +      return BranchKind == JumpKind::ELG; +    case FirstInstrKind::Invalid: +      return false; +    }    } + +  llvm_unreachable("unknown branch fusion type");  }  namespace llvm { diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h index 8d330fa6f9a..2bd36d42c5c 100644 --- a/llvm/lib/Target/X86/X86Subtarget.h +++ b/llvm/lib/Target/X86/X86Subtarget.h @@ -297,6 +297,9 @@ protected:    /// True if the processor supports macrofusion.    bool HasMacroFusion = false; +  /// True if the processor supports branch fusion. +  bool HasBranchFusion = false; +    /// True if the processor has enhanced REP MOVSB/STOSB.    bool HasERMSB = false; @@ -642,6 +645,7 @@ public:    bool hasFastBEXTR() const { return HasFastBEXTR; }    bool hasFastHorizontalOps() const { return HasFastHorizontalOps; }    bool hasMacroFusion() const { return HasMacroFusion; } +  bool hasBranchFusion() const { return HasBranchFusion; }    bool hasERMSB() const { return HasERMSB; }    bool hasSlowDivide32() const { return HasSlowDivide32; }    bool hasSlowDivide64() const { return HasSlowDivide64; } diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 16128ceb92b..134915128d2 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -2984,7 +2984,7 @@ bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,  }  bool X86TTIImpl::canMacroFuseCmp() { -  return ST->hasMacroFusion(); +  return ST->hasMacroFusion() || ST->hasBranchFusion();  }  bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) { diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h index fc9f2763332..e9afa02e67b 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -59,6 +59,7 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {        X86::FeatureLEAForSP,        X86::FeatureLEAUsesAG,        X86::FeatureLZCNTFalseDeps, +      X86::FeatureBranchFusion,        X86::FeatureMacroFusion,        X86::FeatureMergeToThreeWayBranch,        X86::FeaturePadShortFunctions, diff --git a/llvm/test/CodeGen/X86/testb-je-fusion.ll b/llvm/test/CodeGen/X86/testb-je-fusion.ll index cc094856ef6..3028d04ba76 100644 --- a/llvm/test/CodeGen/X86/testb-je-fusion.ll +++ b/llvm/test/CodeGen/X86/testb-je-fusion.ll @@ -1,6 +1,7 @@  ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-- -mattr=-macrofusion | FileCheck %s --check-prefix=NOFUSION -; RUN: llc < %s -mtriple=x86_64-- -mattr=+macrofusion | FileCheck %s --check-prefix=MACROFUSION +; RUN: llc < %s -mtriple=x86_64-- -mattr=-macrofusion,-branchfusion | FileCheck %s --check-prefix=NOFUSION +; RUN: llc < %s -mtriple=x86_64-- -mattr=-macrofusion,+branchfusion | FileCheck %s --check-prefix=BRANCHFUSION --check-prefix=BRANCHFUSIONONLY +; RUN: llc < %s -mtriple=x86_64-- -mattr=+macrofusion,-branchfusion | FileCheck %s --check-prefix=BRANCHFUSION --check-prefix=MACROFUSION  ; testb should be scheduled right before je to enable macro-fusion. @@ -16,16 +17,16 @@ define i32 @macrofuse_test_je(i32 %flags, i8* %p) nounwind {  ; NOFUSION-NEXT:  .LBB0_2: # %if.end  ; NOFUSION-NEXT:    retq  ; -; MACROFUSION-LABEL: macrofuse_test_je: -; MACROFUSION:       # %bb.0: # %entry -; MACROFUSION-NEXT:    xorl %eax, %eax -; MACROFUSION-NEXT:    movb $1, (%rsi) -; MACROFUSION-NEXT:    testl $512, %edi # imm = 0x200 -; MACROFUSION-NEXT:    je .LBB0_2 -; MACROFUSION-NEXT:  # %bb.1: # %if.then -; MACROFUSION-NEXT:    movl $1, %eax -; MACROFUSION-NEXT:  .LBB0_2: # %if.end -; MACROFUSION-NEXT:    retq +; BRANCHFUSION-LABEL: macrofuse_test_je: +; BRANCHFUSION:       # %bb.0: # %entry +; BRANCHFUSION-NEXT:    xorl %eax, %eax +; BRANCHFUSION-NEXT:    movb $1, (%rsi) +; BRANCHFUSION-NEXT:    testl $512, %edi # imm = 0x200 +; BRANCHFUSION-NEXT:    je .LBB0_2 +; BRANCHFUSION-NEXT:  # %bb.1: # %if.then +; BRANCHFUSION-NEXT:    movl $1, %eax +; BRANCHFUSION-NEXT:  .LBB0_2: # %if.end +; BRANCHFUSION-NEXT:    retq  entry:    %and = and i32 %flags, 512    %tobool = icmp eq i32 %and, 0 @@ -53,17 +54,17 @@ define i32 @macrofuse_cmp_je(i32 %flags, i8* %p) nounwind {  ; NOFUSION-NEXT:    xorl %eax, %eax  ; NOFUSION-NEXT:    retq  ; -; MACROFUSION-LABEL: macrofuse_cmp_je: -; MACROFUSION:       # %bb.0: # %entry -; MACROFUSION-NEXT:    movb $1, (%rsi) -; MACROFUSION-NEXT:    cmpl $512, %edi # imm = 0x200 -; MACROFUSION-NEXT:    je .LBB1_1 -; MACROFUSION-NEXT:  # %bb.2: # %if.then -; MACROFUSION-NEXT:    movl $1, %eax -; MACROFUSION-NEXT:    retq -; MACROFUSION-NEXT:  .LBB1_1: -; MACROFUSION-NEXT:    xorl %eax, %eax -; MACROFUSION-NEXT:    retq +; BRANCHFUSION-LABEL: macrofuse_cmp_je: +; BRANCHFUSION:       # %bb.0: # %entry +; BRANCHFUSION-NEXT:    movb $1, (%rsi) +; BRANCHFUSION-NEXT:    cmpl $512, %edi # imm = 0x200 +; BRANCHFUSION-NEXT:    je .LBB1_1 +; BRANCHFUSION-NEXT:  # %bb.2: # %if.then +; BRANCHFUSION-NEXT:    movl $1, %eax +; BRANCHFUSION-NEXT:    retq +; BRANCHFUSION-NEXT:  .LBB1_1: +; BRANCHFUSION-NEXT:    xorl %eax, %eax +; BRANCHFUSION-NEXT:    retq  entry:    %sub = sub i32 %flags, 512    %tobool = icmp eq i32 %sub, 0 @@ -90,6 +91,17 @@ define i32 @macrofuse_alu_je(i32 %flags, i8* %p) nounwind {  ; NOFUSION-NEXT:  .LBB2_2: # %if.end  ; NOFUSION-NEXT:    retq  ; +; BRANCHFUSIONONLY-LABEL: macrofuse_alu_je: +; BRANCHFUSIONONLY:       # %bb.0: # %entry +; BRANCHFUSIONONLY-NEXT:    movl %edi, %eax +; BRANCHFUSIONONLY-NEXT:    addl $-512, %eax # imm = 0xFE00 +; BRANCHFUSIONONLY-NEXT:    movb $1, (%rsi) +; BRANCHFUSIONONLY-NEXT:    je .LBB2_2 +; BRANCHFUSIONONLY-NEXT:  # %bb.1: # %if.then +; BRANCHFUSIONONLY-NEXT:    movl $1, %eax +; BRANCHFUSIONONLY-NEXT:  .LBB2_2: # %if.end +; BRANCHFUSIONONLY-NEXT:    retq +;  ; MACROFUSION-LABEL: macrofuse_alu_je:  ; MACROFUSION:       # %bb.0: # %entry  ; MACROFUSION-NEXT:    movl %edi, %eax @@ -126,6 +138,17 @@ define i32 @macrofuse_dec_je(i32 %flags, i8* %p) nounwind {  ; NOFUSION-NEXT:  .LBB3_2: # %if.end  ; NOFUSION-NEXT:    retq  ; +; BRANCHFUSIONONLY-LABEL: macrofuse_dec_je: +; BRANCHFUSIONONLY:       # %bb.0: # %entry +; BRANCHFUSIONONLY-NEXT:    movl %edi, %eax +; BRANCHFUSIONONLY-NEXT:    decl %eax +; BRANCHFUSIONONLY-NEXT:    movb $1, (%rsi) +; BRANCHFUSIONONLY-NEXT:    je .LBB3_2 +; BRANCHFUSIONONLY-NEXT:  # %bb.1: # %if.then +; BRANCHFUSIONONLY-NEXT:    movl $1, %eax +; BRANCHFUSIONONLY-NEXT:  .LBB3_2: # %if.end +; BRANCHFUSIONONLY-NEXT:    retq +;  ; MACROFUSION-LABEL: macrofuse_dec_je:  ; MACROFUSION:       # %bb.0: # %entry  ; MACROFUSION-NEXT:    movl %edi, %eax diff --git a/llvm/test/Transforms/LoopStrengthReduce/X86/macro-fuse-cmp.ll b/llvm/test/Transforms/LoopStrengthReduce/X86/macro-fuse-cmp.ll index 21a1799fc7c..10a725a7ef2 100644 --- a/llvm/test/Transforms/LoopStrengthReduce/X86/macro-fuse-cmp.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/X86/macro-fuse-cmp.ll @@ -3,8 +3,9 @@  ; RUN: opt < %s -loop-reduce -mcpu=bdver2  -S | FileCheck %s --check-prefix=BUL  ; RUN: opt < %s -loop-reduce -mcpu=haswell -S | FileCheck %s --check-prefix=HSW -; RUN: llc < %s                    | FileCheck %s --check-prefix=BASE -; RUN: llc < %s -mattr=macrofusion | FileCheck %s --check-prefix=FUSE +; RUN: llc < %s                     | FileCheck %s --check-prefix=BASE +; RUN: llc < %s -mattr=macrofusion  | FileCheck %s --check-prefix=FUSE +; RUN: llc < %s -mattr=branchfusion | FileCheck %s --check-prefix=FUSE  target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"  target triple = "x86_64-unknown-unknown"  | 

