author    Craig Topper <craig.topper@intel.com>  2018-07-14 02:05:08 +0000
committer Craig Topper <craig.topper@intel.com>  2018-07-14 02:05:08 +0000
commit    f0b164415c383be9d20031d52cf5db9ae71cded8 (patch)
tree      26beee3fa7ce875eab5f2aa1d6b23cab85871f70 /llvm/lib
parent    70993d37e8dd2031b623071372d7aebefb18c74c (diff)
[X86] Prefer blendi over movss/sd when avx512 is enabled unless optimizing for size.
AVX512 doesn't have an immediate-controlled blend instruction, but blend throughput is still better than movss/sd on SKX. This commit changes AVX512 to use the AVX blend instructions instead of MOVSS/MOVSD. This constrains register allocation, since the VEX-encoded blends cannot use XMM16-31, but hopefully the increased throughput and reduced port 5 pressure make up for that.

llvm-svn: 337083
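A minimal C sketch (not part of the commit) of the source-level patterns the TableGen rules in the diff below select for; the function names are hypothetical, and the instruction actually emitted depends on the target CPU and optimization level:

#include <immintrin.h>

/* Merge the low float of b into a. This is the X86Movss node the
   patterns in the diff match; per the commit message, with AVX512
   enabled this now lowers to vblendps at -O2 and to the 1-2 byte
   shorter vmovss under optsize. */
__m128 merge_low(__m128 a, __m128 b) {
    return _mm_move_ss(a, b);
}

/* Keep the low double of v and zero the upper lane: the X86vzmovl
   pattern moved between predicate blocks in the diff. */
__m128d keep_low(__m128d v) {
    return _mm_move_sd(_mm_setzero_pd(), v);
}

Built with something like -mavx512f -O2, both functions should lower to VEX blends; with -Os the movss/sd forms are kept, which also frees the register allocator to use XMM16-31.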
Diffstat (limited to 'llvm/lib')

 llvm/lib/Target/X86/X86InstrAVX512.td | 23
 llvm/lib/Target/X86/X86InstrSSE.td    |  4
 2 files changed, 16 insertions(+), 11 deletions(-)
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 3df4da13cd5..bfc26258ce4 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -3936,6 +3936,7 @@ def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
multiclass avx512_move_scalar<string asm, SDNode OpNode,
X86VectorVTInfo _> {
+ let Predicates = [HasAVX512, OptForSize] in
def rr : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2),
!strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
@@ -4324,7 +4325,7 @@ def : InstAlias<"vmovsd.s\t{$src2, $src1, $dst {${mask}} {z}|"#
(VMOVSDZrrkz_REV VR128X:$dst, VK1WM:$mask,
VR128X:$src1, VR128X:$src2), 0>;
-let Predicates = [HasAVX512] in {
+let Predicates = [HasAVX512, OptForSize] in {
def : Pat<(v4f32 (X86vzmovl (v4f32 VR128X:$src))),
(VMOVSSZrr (v4f32 (AVX512_128_SET0)), VR128X:$src)>;
def : Pat<(v4i32 (X86vzmovl (v4i32 VR128X:$src))),
@@ -4339,6 +4340,17 @@ let Predicates = [HasAVX512] in {
(SUBREG_TO_REG (i32 0),
(VMOVSSZrr (v4i32 (AVX512_128_SET0)),
(EXTRACT_SUBREG (v8i32 VR256X:$src), sub_xmm)), sub_xmm)>;
+
+ def : Pat<(v4f64 (X86vzmovl (v4f64 VR256X:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (VMOVSDZrr (v2f64 (AVX512_128_SET0)),
+ (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)), sub_xmm)>;
+ def : Pat<(v4i64 (X86vzmovl (v4i64 VR256X:$src))),
+ (SUBREG_TO_REG (i32 0), (VMOVSDZrr (v2i64 (AVX512_128_SET0)),
+ (EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm)), sub_xmm)>;
+}
+
+let Predicates = [HasAVX512] in {
def : Pat<(v16f32 (X86vzmovl (v16f32 VR512:$src))),
(SUBREG_TO_REG (i32 0),
(VMOVSSZrr (v4f32 (AVX512_128_SET0)),
@@ -4405,18 +4417,11 @@ let Predicates = [HasAVX512] in {
(SUBREG_TO_REG (i64 0), (VMOVQI2PQIZrm addr:$src), sub_xmm)>;
// Move low f64 and clear high bits.
- def : Pat<(v4f64 (X86vzmovl (v4f64 VR256X:$src))),
- (SUBREG_TO_REG (i32 0),
- (VMOVSDZrr (v2f64 (AVX512_128_SET0)),
- (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)), sub_xmm)>;
def : Pat<(v8f64 (X86vzmovl (v8f64 VR512:$src))),
(SUBREG_TO_REG (i32 0),
(VMOVSDZrr (v2f64 (AVX512_128_SET0)),
(EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm)), sub_xmm)>;
- def : Pat<(v4i64 (X86vzmovl (v4i64 VR256X:$src))),
- (SUBREG_TO_REG (i32 0), (VMOVSDZrr (v2i64 (AVX512_128_SET0)),
- (EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm)), sub_xmm)>;
def : Pat<(v8i64 (X86vzmovl (v8i64 VR512:$src))),
(SUBREG_TO_REG (i32 0), (VMOVSDZrr (v2i64 (AVX512_128_SET0)),
(EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm)), sub_xmm)>;
@@ -4425,7 +4430,9 @@ let Predicates = [HasAVX512] in {
def : Pat<(store (f32 (extractelt (v4f32 VR128X:$src), (iPTR 0))),
addr:$dst),
(VMOVSSZmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128X:$src), FR32X))>;
+}
+let Predicates = [HasAVX512, OptForSize] in {
// Shuffle with VMOVSS
def : Pat<(v4i32 (X86Movss VR128X:$src1, VR128X:$src2)),
(VMOVSSZrr (v4i32 VR128X:$src1), VR128X:$src2)>;
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 8f195847528..5c8b612f49b 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -6394,8 +6394,7 @@ let Predicates = [HasAVX2] in {
// Prefer a movss or movsd over a blendps when optimizing for size. these were
// changed to use blends because blends have better throughput on sandybridge
// and haswell, but movs[s/d] are 1-2 byte shorter instructions.
-let Predicates = [UseAVX] in {
- let Predicates = [UseAVX, OptForSpeed] in {
+let Predicates = [HasAVX, OptForSpeed] in {
def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
(VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
@@ -6410,7 +6409,6 @@ let Predicates = [UseAVX] in {
(VBLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
(VPBLENDWrri VR128:$src1, VR128:$src2, (i8 0xf))>;
- }
// Move low f32 and clear high bits.
def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),