diff options
| -rw-r--r-- | llvm/lib/Target/X86/X86.td | 12 | ||||
| -rw-r--r-- | llvm/lib/Target/X86/X86InstrInfo.td | 1 | ||||
| -rw-r--r-- | llvm/lib/Target/X86/X86InstrShiftRotate.td | 9 | ||||
| -rw-r--r-- | llvm/lib/Target/X86/X86Subtarget.cpp | 1 | ||||
| -rw-r--r-- | llvm/lib/Target/X86/X86Subtarget.h | 4 | ||||
| -rw-r--r-- | llvm/test/CodeGen/X86/rot32.ll | 9 | ||||
| -rw-r--r-- | llvm/test/CodeGen/X86/rot64.ll | 9 | 
7 files changed, 44 insertions, 1 deletion
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index 0253a9bcc5b..d1807043350 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -263,6 +263,15 @@ def FeatureFastLZCNT            "fast-lzcnt", "HasFastLZCNT", "true",            "LZCNT instructions are as fast as most simple integer ops">; + +// Sandy Bridge and newer processors can use SHLD with the same source on both +// inputs to implement rotate to avoid the partial flag update of the normal +// rotate instructions. +def FeatureFastSHLDRotate +    : SubtargetFeature< +          "fast-shld-rotate", "HasFastSHLDRotate", "true", +          "SHLD can be used as a faster rotate">; +  //===----------------------------------------------------------------------===//  // X86 processors supported.  //===----------------------------------------------------------------------===// @@ -458,7 +467,8 @@ def SNBFeatures : ProcessorFeatures<[], [    FeatureXSAVE,    FeatureXSAVEOPT,    FeatureLAHFSAHF, -  FeatureFastScalarFSQRT +  FeatureFastScalarFSQRT, +  FeatureFastSHLDRotate  ]>;  class SandyBridgeProc<string Name> : ProcModel<Name, SandyBridgeModel, diff --git a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td index 8073fe6e726..f7b5e5604a7 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.td +++ b/llvm/lib/Target/X86/X86InstrInfo.td @@ -897,6 +897,7 @@ def FavorMemIndirectCall  : Predicate<"!Subtarget->callRegIndirect()">;  def NotSlowIncDec : Predicate<"!Subtarget->slowIncDec()">;  def HasFastMem32 : Predicate<"!Subtarget->isUnalignedMem32Slow()">;  def HasFastLZCNT : Predicate<"Subtarget->hasFastLZCNT()">; +def HasFastSHLDRotate : Predicate<"Subtarget->hasFastSHLDRotate()">;  def HasMFence    : Predicate<"Subtarget->hasMFence()">;  //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/X86/X86InstrShiftRotate.td b/llvm/lib/Target/X86/X86InstrShiftRotate.td index 8291ba0dc39..b21f0b923da 100644 --- 
a/llvm/lib/Target/X86/X86InstrShiftRotate.td +++ b/llvm/lib/Target/X86/X86InstrShiftRotate.td @@ -846,6 +846,15 @@ def SHRD64mri8 : RIi8<0xAC, MRMDestMem,  } // Defs = [EFLAGS] +// Sandy Bridge and newer Intel processors support faster rotates using +// SHLD to avoid a partial flag update on the normal rotate instructions. +let Predicates = [HasFastSHLDRotate], AddedComplexity = 5 in { +  def : Pat<(rotl GR32:$src, (i8 imm:$shamt)), +            (SHLD32rri8 GR32:$src, GR32:$src, imm:$shamt)>; +  def : Pat<(rotl GR64:$src, (i8 imm:$shamt)), +            (SHLD64rri8 GR64:$src, GR64:$src, imm:$shamt)>; +} +  def ROT32L2R_imm8  : SDNodeXForm<imm, [{    // Convert a ROTL shamt to a ROTR shamt on 32-bit integer.    return getI8Imm(32 - N->getZExtValue(), SDLoc(N)); diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp index dfe22faef02..336db6647a3 100644 --- a/llvm/lib/Target/X86/X86Subtarget.cpp +++ b/llvm/lib/Target/X86/X86Subtarget.cpp @@ -302,6 +302,7 @@ void X86Subtarget::initializeEnvironment() {    HasFastScalarFSQRT = false;    HasFastVectorFSQRT = false;    HasFastLZCNT = false; +  HasFastSHLDRotate = false;    HasSlowDivide32 = false;    HasSlowDivide64 = false;    PadShortFunctions = false; diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h index a82d92c2854..75b87a04e51 100644 --- a/llvm/lib/Target/X86/X86Subtarget.h +++ b/llvm/lib/Target/X86/X86Subtarget.h @@ -229,6 +229,9 @@ protected:    /// True if LZCNT instruction is fast.    bool HasFastLZCNT; +  /// True if SHLD based rotate is fast. +  bool HasFastSHLDRotate; +    /// True if the short functions should be padded to prevent    /// a stall when returning too early.    
bool PadShortFunctions; @@ -466,6 +469,7 @@ public:    bool hasFastScalarFSQRT() const { return HasFastScalarFSQRT; }    bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; }    bool hasFastLZCNT() const { return HasFastLZCNT; } +  bool hasFastSHLDRotate() const { return HasFastSHLDRotate; }    bool hasSlowDivide32() const { return HasSlowDivide32; }    bool hasSlowDivide64() const { return HasSlowDivide64; }    bool padShortFunctions() const { return PadShortFunctions; } diff --git a/llvm/test/CodeGen/X86/rot32.ll b/llvm/test/CodeGen/X86/rot32.ll index 35f4784bc1d..79ecbe0514d 100644 --- a/llvm/test/CodeGen/X86/rot32.ll +++ b/llvm/test/CodeGen/X86/rot32.ll @@ -1,4 +1,5 @@  ; RUN: llc < %s -march=x86 -mcpu=corei7 | FileCheck %s +; RUN: llc < %s -march=x86 -mcpu=corei7-avx | FileCheck %s --check-prefix=SHLD  ; RUN: llc < %s -march=x86 -mcpu=core-avx2 | FileCheck %s --check-prefix=BMI2  define i32 @foo(i32 %x, i32 %y, i32 %z) nounwind readnone { @@ -49,6 +50,8 @@ define i32 @xfoo(i32 %x, i32 %y, i32 %z) nounwind readnone {  entry:  ; CHECK-LABEL: xfoo:  ; CHECK: roll $7 +; SHLD-LABEL: xfoo: +; SHLD: shldl $7  ; BMI2-LABEL: xfoo:  ; BMI2: rorxl $25  	%0 = lshr i32 %x, 25 @@ -61,6 +64,8 @@ define i32 @xfoop(i32* %p) nounwind readnone {  entry:  ; CHECK-LABEL: xfoop:  ; CHECK: roll $7 +; SHLD-LABEL: xfoop: +; SHLD: shldl $7  ; BMI2-LABEL: xfoop:  ; BMI2: rorxl $25  	%x = load i32, i32* %p @@ -84,6 +89,8 @@ define i32 @xun(i32 %x, i32 %y, i32 %z) nounwind readnone {  entry:  ; CHECK-LABEL: xun:  ; CHECK: roll $25 +; SHLD-LABEL: xun: +; SHLD: shldl $25  ; BMI2-LABEL: xun:  ; BMI2: rorxl $7  	%0 = lshr i32 %x, 7 @@ -96,6 +103,8 @@ define i32 @xunp(i32* %p) nounwind readnone {  entry:  ; CHECK-LABEL: xunp:  ; CHECK: roll $25 +; SHLD-LABEL: xunp: +; SHLD: shldl $25  ; BMI2-LABEL: xunp:  ; BMI2: rorxl $7  	%x = load i32, i32* %p diff --git a/llvm/test/CodeGen/X86/rot64.ll b/llvm/test/CodeGen/X86/rot64.ll index fc382ed7fc6..976acbb0167 100644 --- 
a/llvm/test/CodeGen/X86/rot64.ll +++ b/llvm/test/CodeGen/X86/rot64.ll @@ -1,4 +1,5 @@  ; RUN: llc < %s -march=x86-64 -mcpu=corei7 | FileCheck %s +; RUN: llc < %s -march=x86-64 -mcpu=corei7-avx | FileCheck %s --check-prefix=SHLD  ; RUN: llc < %s -march=x86-64 -mcpu=core-avx2 | FileCheck %s --check-prefix=BMI2  define i64 @foo(i64 %x, i64 %y, i64 %z) nounwind readnone { @@ -49,6 +50,8 @@ define i64 @xfoo(i64 %x, i64 %y, i64 %z) nounwind readnone {  entry:  ; CHECK-LABEL: xfoo:  ; CHECK: rolq $7 +; SHLD-LABEL: xfoo: +; SHLD: shldq $7  ; BMI2-LABEL: xfoo:  ; BMI2: rorxq $57  	%0 = lshr i64 %x, 57 @@ -61,6 +64,8 @@ define i64 @xfoop(i64* %p) nounwind readnone {  entry:  ; CHECK-LABEL: xfoop:  ; CHECK: rolq $7 +; SHLD-LABEL: xfoop: +; SHLD: shldq $7  ; BMI2-LABEL: xfoop:  ; BMI2: rorxq $57  	%x = load i64, i64* %p @@ -84,6 +89,8 @@ define i64 @xun(i64 %x, i64 %y, i64 %z) nounwind readnone {  entry:  ; CHECK-LABEL: xun:  ; CHECK: rolq $57 +; SHLD-LABEL: xun: +; SHLD: shldq $57  ; BMI2-LABEL: xun:  ; BMI2: rorxq $7  	%0 = lshr i64 %x, 7 @@ -96,6 +103,8 @@ define i64 @xunp(i64* %p) nounwind readnone {  entry:  ; CHECK-LABEL: xunp:  ; CHECK: rolq $57 +; SHLD-LABEL: xunp: +; SHLD: shldq $57  ; BMI2-LABEL: xunp:  ; BMI2: rorxq $7  	%x = load i64, i64* %p  | 

