4 files changed, 40 insertions, 7 deletions
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index 3e68120041c..da4ca665d83 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -344,10 +344,18 @@ bool X86AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
     return true;
   }
 
-  uint64_t MaxNopLength = STI.getFeatureBits()[X86::ProcIntelSLM] ? 7 : 15;
-
-  // 15 is the longest single nop instruction.  Emit as many 15-byte nops as
-  // needed, then emit a nop of the remaining length.
+  // 15 bytes is the longest single NOP instruction, but 10 bytes is
+  // commonly the longest that can be efficiently decoded.
+  uint64_t MaxNopLength = 10;
+  if (STI.getFeatureBits()[X86::ProcIntelSLM])
+    MaxNopLength = 7;
+  else if (STI.getFeatureBits()[X86::FeatureFast15ByteNOP])
+    MaxNopLength = 15;
+  else if (STI.getFeatureBits()[X86::FeatureFast11ByteNOP])
+    MaxNopLength = 11;
+
+  // Emit as many MaxNopLength NOPs as needed, then emit a NOP of the remaining
+  // length.
   do {
     const uint8_t ThisNopLength = (uint8_t) std::min(Count, MaxNopLength);
     const uint8_t Prefixes = ThisNopLength <= 10 ? 0 : ThisNopLength - 10;
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 2c064c43d9d..9f0d8aa0d35 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -305,8 +305,16 @@ def FeatureFastLZCNT
     : SubtargetFeature<
           "fast-lzcnt", "HasFastLZCNT", "true",
           "LZCNT instructions are as fast as most simple integer ops">;
-
-
+// Set if the target can efficiently decode NOPs of up to 11 bytes in length.
+def FeatureFast11ByteNOP
+    : SubtargetFeature<
+          "fast-11bytenop", "HasFast11ByteNOP", "true",
+          "Target can quickly decode up to 11 byte NOPs">;
+// Set if the target can efficiently decode NOPs of up to 15 bytes in length.
+def FeatureFast15ByteNOP
+    : SubtargetFeature<
+          "fast-15bytenop", "HasFast15ByteNOP", "true",
+          "Target can quickly decode up to 15 byte NOPs">;
 // Sandy Bridge and newer processors can use SHLD with the same source on both
 // inputs to implement rotate to avoid the partial flag update of the normal
 // rotate instructions.
@@ -849,7 +857,8 @@ def : Proc<"btver1", [
   FeatureLZCNT,
   FeaturePOPCNT,
   FeatureSlowSHLD,
-  FeatureLAHFSAHF
+  FeatureLAHFSAHF,
+  FeatureFast15ByteNOP
 ]>;
 
 // Jaguar
@@ -874,6 +883,7 @@ def : ProcessorModel<"btver2", BtVer2Model, [
   FeatureXSAVEOPT,
   FeatureSlowSHLD,
   FeatureLAHFSAHF,
+  FeatureFast15ByteNOP,
   FeatureFastPartialYMMorZMMWrite
 ]>;
 
@@ -897,6 +907,7 @@ def : Proc<"bdver1", [
   FeatureLWP,
   FeatureSlowSHLD,
   FeatureLAHFSAHF,
+  FeatureFast11ByteNOP,
   FeatureMacroFusion
 ]>;
 // Piledriver
@@ -923,6 +934,7 @@ def : Proc<"bdver2", [
   FeatureFMA,
   FeatureSlowSHLD,
   FeatureLAHFSAHF,
+  FeatureFast11ByteNOP,
   FeatureMacroFusion
 ]>;
 
@@ -952,6 +964,7 @@ def : Proc<"bdver3", [
   FeatureSlowSHLD,
   FeatureFSGSBase,
   FeatureLAHFSAHF,
+  FeatureFast11ByteNOP,
   FeatureMacroFusion
 ]>;
 
@@ -981,6 +994,7 @@ def : Proc<"bdver4", [
   FeatureSlowSHLD,
   FeatureFSGSBase,
   FeatureLAHFSAHF,
+  FeatureFast11ByteNOP,
   FeatureMWAITX,
   FeatureMacroFusion
 ]>;
@@ -1003,6 +1017,7 @@ def: ProcessorModel<"znver1", Znver1Model, [
   FeatureFastLZCNT,
   FeatureLAHFSAHF,
   FeatureLZCNT,
+  FeatureFast15ByteNOP,
   FeatureMacroFusion,
   FeatureMMX,
   FeatureMOVBE,
diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp
index 217b22c2bf7..7c4af0aea57 100644
--- a/llvm/lib/Target/X86/X86Subtarget.cpp
+++ b/llvm/lib/Target/X86/X86Subtarget.cpp
@@ -335,6 +335,8 @@ void X86Subtarget::initializeEnvironment() {
   HasLZCNTFalseDeps = false;
   HasFastVariableShuffle = false;
   HasFastPartialYMMorZMMWrite = false;
+  HasFast11ByteNOP = false;
+  HasFast15ByteNOP = false;
   HasFastGather = false;
   HasFastScalarFSQRT = false;
   HasFastVectorFSQRT = false;
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
index e34735bffa5..b8adb63ef03 100644
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -246,6 +246,14 @@ protected:
   /// of a YMM or ZMM register without clearing the upper part.
   bool HasFastPartialYMMorZMMWrite;
 
+  /// True if there is no performance penalty for decoding NOPs of up to
+  /// 11 bytes in length.
+  bool HasFast11ByteNOP;
+
+  /// True if there is no performance penalty for decoding NOPs of up to
+  /// 15 bytes in length.
+  bool HasFast15ByteNOP;
+
   /// True if gather is reasonably fast. This is true for Skylake client and
   /// all AVX-512 CPUs.
   bool HasFastGather;