summaryrefslogtreecommitdiffstats
path: root/llvm/lib/Target
diff options
context:
space:
mode:
authorSimon Pilgrim <llvm-dev@redking.me.uk>2018-01-29 21:24:31 +0000
committerSimon Pilgrim <llvm-dev@redking.me.uk>2018-01-29 21:24:31 +0000
commit02bdac53e75aa3bff67d3320d29597c1188d641b (patch)
treebac3bbc9f526876f5e57e0c513f376ce14d7ad99 /llvm/lib/Target
parent08464524c34daa350ba4eaafd6231ddc1c3edee0 (diff)
downloadbcm5719-llvm-02bdac53e75aa3bff67d3320d29597c1188d641b.tar.gz
bcm5719-llvm-02bdac53e75aa3bff67d3320d29597c1188d641b.zip
[X86] Emit 11-byte or 15-byte NOPs on recent AMD targets, else default to 10-byte NOPs (PR22965)
We currently emit up to 15-byte NOPs on all targets (apart from Silvermont), which stalls performance on some targets whose decoders struggle with NOPs carrying 2, 3 or more '66' prefixes. This patch flags recent AMD targets (btver*/znver1) to still emit 15-byte NOPs and bdver* targets to emit 11-byte NOPs. All other targets now emit 10-byte NOPs, apart from Silvermont CPUs, which still emit 7-byte NOPs. Differential Revision: https://reviews.llvm.org/D42616 llvm-svn: 323693
Diffstat (limited to 'llvm/lib/Target')
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp16
-rw-r--r--llvm/lib/Target/X86/X86.td21
-rw-r--r--llvm/lib/Target/X86/X86Subtarget.cpp2
-rw-r--r--llvm/lib/Target/X86/X86Subtarget.h8
4 files changed, 40 insertions, 7 deletions
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index 3e68120041c..da4ca665d83 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -344,10 +344,18 @@ bool X86AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
return true;
}
- uint64_t MaxNopLength = STI.getFeatureBits()[X86::ProcIntelSLM] ? 7 : 15;
-
- // 15 is the longest single nop instruction. Emit as many 15-byte nops as
- // needed, then emit a nop of the remaining length.
+ // 15-bytes is the longest single NOP instruction, but 10-bytes is
+ // commonly the longest that can be efficiently decoded.
+ uint64_t MaxNopLength = 10;
+ if (STI.getFeatureBits()[X86::ProcIntelSLM])
+ MaxNopLength = 7;
+ else if (STI.getFeatureBits()[X86::FeatureFast15ByteNOP])
+ MaxNopLength = 15;
+ else if (STI.getFeatureBits()[X86::FeatureFast11ByteNOP])
+ MaxNopLength = 11;
+
+ // Emit as many MaxNopLength NOPs as needed, then emit a NOP of the remaining
+ // length.
do {
const uint8_t ThisNopLength = (uint8_t) std::min(Count, MaxNopLength);
const uint8_t Prefixes = ThisNopLength <= 10 ? 0 : ThisNopLength - 10;
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 2c064c43d9d..9f0d8aa0d35 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -305,8 +305,16 @@ def FeatureFastLZCNT
: SubtargetFeature<
"fast-lzcnt", "HasFastLZCNT", "true",
"LZCNT instructions are as fast as most simple integer ops">;
-
-
+// If the target can efficiently decode NOPs up to 11 bytes in length.
+def FeatureFast11ByteNOP
+ : SubtargetFeature<
+ "fast-11bytenop", "HasFast11ByteNOP", "true",
+ "Target can quickly decode up to 11 byte NOPs">;
+// If the target can efficiently decode NOPs up to 15 bytes in length.
+def FeatureFast15ByteNOP
+ : SubtargetFeature<
+ "fast-15bytenop", "HasFast15ByteNOP", "true",
+ "Target can quickly decode up to 15 byte NOPs">;
// Sandy Bridge and newer processors can use SHLD with the same source on both
// inputs to implement rotate to avoid the partial flag update of the normal
// rotate instructions.
@@ -849,7 +857,8 @@ def : Proc<"btver1", [
FeatureLZCNT,
FeaturePOPCNT,
FeatureSlowSHLD,
- FeatureLAHFSAHF
+ FeatureLAHFSAHF,
+ FeatureFast15ByteNOP
]>;
// Jaguar
@@ -874,6 +883,7 @@ def : ProcessorModel<"btver2", BtVer2Model, [
FeatureXSAVEOPT,
FeatureSlowSHLD,
FeatureLAHFSAHF,
+ FeatureFast15ByteNOP,
FeatureFastPartialYMMorZMMWrite
]>;
@@ -897,6 +907,7 @@ def : Proc<"bdver1", [
FeatureLWP,
FeatureSlowSHLD,
FeatureLAHFSAHF,
+ FeatureFast11ByteNOP,
FeatureMacroFusion
]>;
// Piledriver
@@ -923,6 +934,7 @@ def : Proc<"bdver2", [
FeatureFMA,
FeatureSlowSHLD,
FeatureLAHFSAHF,
+ FeatureFast11ByteNOP,
FeatureMacroFusion
]>;
@@ -952,6 +964,7 @@ def : Proc<"bdver3", [
FeatureSlowSHLD,
FeatureFSGSBase,
FeatureLAHFSAHF,
+ FeatureFast11ByteNOP,
FeatureMacroFusion
]>;
@@ -981,6 +994,7 @@ def : Proc<"bdver4", [
FeatureSlowSHLD,
FeatureFSGSBase,
FeatureLAHFSAHF,
+ FeatureFast11ByteNOP,
FeatureMWAITX,
FeatureMacroFusion
]>;
@@ -1003,6 +1017,7 @@ def: ProcessorModel<"znver1", Znver1Model, [
FeatureFastLZCNT,
FeatureLAHFSAHF,
FeatureLZCNT,
+ FeatureFast15ByteNOP,
FeatureMacroFusion,
FeatureMMX,
FeatureMOVBE,
diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp
index 217b22c2bf7..7c4af0aea57 100644
--- a/llvm/lib/Target/X86/X86Subtarget.cpp
+++ b/llvm/lib/Target/X86/X86Subtarget.cpp
@@ -335,6 +335,8 @@ void X86Subtarget::initializeEnvironment() {
HasLZCNTFalseDeps = false;
HasFastVariableShuffle = false;
HasFastPartialYMMorZMMWrite = false;
+ HasFast11ByteNOP = false;
+ HasFast15ByteNOP = false;
HasFastGather = false;
HasFastScalarFSQRT = false;
HasFastVectorFSQRT = false;
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
index e34735bffa5..b8adb63ef03 100644
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -246,6 +246,14 @@ protected:
/// of a YMM or ZMM register without clearing the upper part.
bool HasFastPartialYMMorZMMWrite;
+ /// True if there is no performance penalty for writing NOPs with up to
+ /// 11 bytes.
+ bool HasFast11ByteNOP;
+
+ /// True if there is no performance penalty for writing NOPs with up to
+ /// 15 bytes.
+ bool HasFast15ByteNOP;
+
/// True if gather is reasonably fast. This is true for Skylake client and
/// all AVX-512 CPUs.
bool HasFastGather;
OpenPOWER on IntegriCloud