Use broadcasts to optimize overall size when loading constant splat vectors (x86-64 with AVX or AVX2).

We generate broadcast instructions on CPUs with AVX2 to load some constant splat vectors. This patch should preserve all existing behavior with regular optimization levels, but also use splats whenever possible when optimizing for *size* on any CPU with AVX or AVX2. The tradeoff is up to 5 extra instruction bytes for the broadcast instruction to save at least 8 bytes (up to 31 bytes) of constant pool data. Differential Revision: http://reviews.llvm.org/D5347 llvm-svn: 218263
author: Sanjay Patel <spatel@rotateright.com> 2014-09-22 18:54:01 +0000
committer: Sanjay Patel <spatel@rotateright.com> 2014-09-22 18:54:01 +0000
commit: 7939d7229d374b31c6237718d69c3c0bfa0846a8 (patch)
tree: d6f8518ad25a9dda028cb7c85519d291a8147729 /llvm/lib
parent: 869c0019b11546fc1ed191834cd5d2d4ae9ffc2a (diff)
download: bcm5719-llvm-7939d7229d374b31c6237718d69c3c0bfa0846a8.tar.gz
bcm5719-llvm-7939d7229d374b31c6237718d69c3c0bfa0846a8.zip
2 files changed, 33 insertions, 7 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 5b610b4e29f..f67eb96ade7 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -5996,7 +5996,10 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
 /// or SDValue() otherwise.
 static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
                                     SelectionDAG &DAG) {
-  if (!Subtarget->hasFp256())
+  // VBROADCAST requires AVX.
+  // TODO: Splats could be generated for non-AVX CPUs using SSE
+  // instructions, but there's less potential gain for only 128-bit vectors.
+  if (!Subtarget->hasAVX())
     return SDValue();
 
   MVT VT = Op.getSimpleValueType();
@@ -6073,17 +6076,34 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
     }
   }
 
+  unsigned ScalarSize = Ld.getValueType().getSizeInBits();
   bool IsGE256 = (VT.getSizeInBits() >= 256);
 
-  // Handle the broadcasting a single constant scalar from the constant pool
-  // into a vector. On Sandybridge it is still better to load a constant vector
+  // When optimizing for size, generate up to 5 extra bytes for a broadcast
+  // instruction to save 8 or more bytes of constant pool data.
+  // TODO: If multiple splats are generated to load the same constant,
+  // it may be detrimental to overall size. There needs to be a way to detect
+  // that condition to know if this is truly a size win.
+  const Function *F = DAG.getMachineFunction().getFunction();
+  bool OptForSize = F->getAttributes().
+    hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
+
+  // Handle broadcasting a single constant scalar from the constant pool
+  // into a vector.
+  // On Sandybridge (no AVX2), it is still better to load a constant vector
   // from the constant pool and not to broadcast it from a scalar.
-  if (ConstSplatVal && Subtarget->hasInt256()) {
+  // But override that restriction when optimizing for size.
+  // TODO: Check if splatting is recommended for other AVX-capable CPUs.
+  if (ConstSplatVal && (Subtarget->hasAVX2() || OptForSize)) {
     EVT CVT = Ld.getValueType();
     assert(!CVT.isVector() && "Must not broadcast a vector type");
-    unsigned ScalarSize = CVT.getSizeInBits();
 
-    if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)) {
+    // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
+    // For size optimization, also splat v2f64 and v2i64, and for size opt
+    // with AVX2, also splat i8 and i16.
+    // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
+    if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
+        (OptForSize && (ScalarSize == 64 || Subtarget->hasAVX2()))) {
       const Constant *C = nullptr;
       if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
         C = CI->getConstantIntValue();
@@ -6104,7 +6124,6 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
   }
 
   bool IsLoad = ISD::isNormalLoad(Ld.getNode());
-  unsigned ScalarSize = Ld.getValueType().getSizeInBits();
 
   // Handle AVX2 in-register broadcasts.
   if (!IsLoad && Subtarget->hasInt256() &&
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 9030119edef..b2ffcb8f346 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -5290,6 +5290,13 @@ let Predicates = [HasAVX] in {
             (VMOVDDUPYrr VR256:$src)>;
 }
 
+let Predicates = [UseAVX, OptForSize] in {
+  def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))),
+  (VMOVDDUPrm addr:$src)>;
+  def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))),
+  (VMOVDDUPrm addr:$src)>;
+}
+
 let Predicates = [UseSSE3] in {
   def : Pat<(X86Movddup (memopv2f64 addr:$src)),
             (MOVDDUPrm addr:$src)>;
author	Sanjay Patel <spatel@rotateright.com>	2014-09-22 18:54:01 +0000
committer	Sanjay Patel <spatel@rotateright.com>	2014-09-22 18:54:01 +0000
commit	7939d7229d374b31c6237718d69c3c0bfa0846a8 (patch)
tree	d6f8518ad25a9dda028cb7c85519d291a8147729 /llvm/lib
parent	869c0019b11546fc1ed191834cd5d2d4ae9ffc2a (diff)
download	bcm5719-llvm-7939d7229d374b31c6237718d69c3c0bfa0846a8.tar.gz bcm5719-llvm-7939d7229d374b31c6237718d69c3c0bfa0846a8.zip