diff options
| author | Simon Pilgrim <llvm-dev@redking.me.uk> | 2018-01-29 21:24:31 +0000 |
|---|---|---|
| committer | Simon Pilgrim <llvm-dev@redking.me.uk> | 2018-01-29 21:24:31 +0000 |
| commit | 02bdac53e75aa3bff67d3320d29597c1188d641b (patch) | |
| tree | bac3bbc9f526876f5e57e0c513f376ce14d7ad99 /llvm/lib/Target | |
| parent | 08464524c34daa350ba4eaafd6231ddc1c3edee0 (diff) | |
| download | bcm5719-llvm-02bdac53e75aa3bff67d3320d29597c1188d641b.tar.gz bcm5719-llvm-02bdac53e75aa3bff67d3320d29597c1188d641b.zip | |
[X86] Emit 11-byte or 15-byte NOPs on recent AMD targets, else default to 10-byte NOPs (PR22965)
We currently emit up to 15-byte NOPs on all targets (apart from Silvermont), which hurts performance on some targets whose decoders struggle with more than 2 or 3 '66' prefixes.
This patch flags recent AMD targets (btver1/btver2/znver1) to still emit 15-byte NOPs and bdver* targets to emit 11-byte NOPs. All other targets now emit 10-byte NOPs, apart from Silvermont CPUs, which still emit 7-byte NOPs.
Differential Revision: https://reviews.llvm.org/D42616
llvm-svn: 323693
Diffstat (limited to 'llvm/lib/Target')
| -rw-r--r-- | llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp | 16 | ||||
| -rw-r--r-- | llvm/lib/Target/X86/X86.td | 21 | ||||
| -rw-r--r-- | llvm/lib/Target/X86/X86Subtarget.cpp | 2 | ||||
| -rw-r--r-- | llvm/lib/Target/X86/X86Subtarget.h | 8 |
4 files changed, 40 insertions, 7 deletions
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 3e68120041c..da4ca665d83 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -344,10 +344,18 @@ bool X86AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { return true; } - uint64_t MaxNopLength = STI.getFeatureBits()[X86::ProcIntelSLM] ? 7 : 15; - - // 15 is the longest single nop instruction. Emit as many 15-byte nops as - // needed, then emit a nop of the remaining length. + // 15-bytes is the longest single NOP instruction, but 10-bytes is + // commonly the longest that can be efficiently decoded. + uint64_t MaxNopLength = 10; + if (STI.getFeatureBits()[X86::ProcIntelSLM]) + MaxNopLength = 7; + else if (STI.getFeatureBits()[X86::FeatureFast15ByteNOP]) + MaxNopLength = 15; + else if (STI.getFeatureBits()[X86::FeatureFast11ByteNOP]) + MaxNopLength = 11; + + // Emit as many MaxNopLength NOPs as needed, then emit a NOP of the remaining + // length. do { const uint8_t ThisNopLength = (uint8_t) std::min(Count, MaxNopLength); const uint8_t Prefixes = ThisNopLength <= 10 ? 0 : ThisNopLength - 10; diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index 2c064c43d9d..9f0d8aa0d35 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -305,8 +305,16 @@ def FeatureFastLZCNT : SubtargetFeature< "fast-lzcnt", "HasFastLZCNT", "true", "LZCNT instructions are as fast as most simple integer ops">; - - +// If the target can efficiently decode NOPs upto 11-bytes in length. +def FeatureFast11ByteNOP + : SubtargetFeature< + "fast-11bytenop", "HasFast11ByteNOP", "true", + "Target can quickly decode up to 11 byte NOPs">; +// If the target can efficiently decode NOPs upto 15-bytes in length. 
+def FeatureFast15ByteNOP + : SubtargetFeature< + "fast-15bytenop", "HasFast15ByteNOP", "true", + "Target can quickly decode up to 15 byte NOPs">; // Sandy Bridge and newer processors can use SHLD with the same source on both // inputs to implement rotate to avoid the partial flag update of the normal // rotate instructions. @@ -849,7 +857,8 @@ def : Proc<"btver1", [ FeatureLZCNT, FeaturePOPCNT, FeatureSlowSHLD, - FeatureLAHFSAHF + FeatureLAHFSAHF, + FeatureFast15ByteNOP ]>; // Jaguar @@ -874,6 +883,7 @@ def : ProcessorModel<"btver2", BtVer2Model, [ FeatureXSAVEOPT, FeatureSlowSHLD, FeatureLAHFSAHF, + FeatureFast15ByteNOP, FeatureFastPartialYMMorZMMWrite ]>; @@ -897,6 +907,7 @@ def : Proc<"bdver1", [ FeatureLWP, FeatureSlowSHLD, FeatureLAHFSAHF, + FeatureFast11ByteNOP, FeatureMacroFusion ]>; // Piledriver @@ -923,6 +934,7 @@ def : Proc<"bdver2", [ FeatureFMA, FeatureSlowSHLD, FeatureLAHFSAHF, + FeatureFast11ByteNOP, FeatureMacroFusion ]>; @@ -952,6 +964,7 @@ def : Proc<"bdver3", [ FeatureSlowSHLD, FeatureFSGSBase, FeatureLAHFSAHF, + FeatureFast11ByteNOP, FeatureMacroFusion ]>; @@ -981,6 +994,7 @@ def : Proc<"bdver4", [ FeatureSlowSHLD, FeatureFSGSBase, FeatureLAHFSAHF, + FeatureFast11ByteNOP, FeatureMWAITX, FeatureMacroFusion ]>; @@ -1003,6 +1017,7 @@ def: ProcessorModel<"znver1", Znver1Model, [ FeatureFastLZCNT, FeatureLAHFSAHF, FeatureLZCNT, + FeatureFast15ByteNOP, FeatureMacroFusion, FeatureMMX, FeatureMOVBE, diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp index 217b22c2bf7..7c4af0aea57 100644 --- a/llvm/lib/Target/X86/X86Subtarget.cpp +++ b/llvm/lib/Target/X86/X86Subtarget.cpp @@ -335,6 +335,8 @@ void X86Subtarget::initializeEnvironment() { HasLZCNTFalseDeps = false; HasFastVariableShuffle = false; HasFastPartialYMMorZMMWrite = false; + HasFast11ByteNOP = false; + HasFast15ByteNOP = false; HasFastGather = false; HasFastScalarFSQRT = false; HasFastVectorFSQRT = false; diff --git a/llvm/lib/Target/X86/X86Subtarget.h 
b/llvm/lib/Target/X86/X86Subtarget.h index e34735bffa5..b8adb63ef03 100644 --- a/llvm/lib/Target/X86/X86Subtarget.h +++ b/llvm/lib/Target/X86/X86Subtarget.h @@ -246,6 +246,14 @@ protected: /// of a YMM or ZMM register without clearing the upper part. bool HasFastPartialYMMorZMMWrite; + /// True if there is no performance penalty for writing NOPs with up to + /// 11 bytes. + bool HasFast11ByteNOP; + + /// True if there is no performance penalty for writing NOPs with up to + /// 15 bytes. + bool HasFast15ByteNOP; + /// True if gather is reasonably fast. This is true for Skylake client and /// all AVX-512 CPUs. bool HasFastGather; |

