author     Craig Topper <craig.topper@intel.com>   2018-07-15 18:51:07 +0000
committer  Craig Topper <craig.topper@intel.com>   2018-07-15 18:51:07 +0000
commit     8f34858779b6b89ed9bdf0a7791405f02c8200e3 (patch)
tree       5a121b00adbe62ea2e7fa9e6533e8f226febf32f /llvm/lib
parent     6712b8675bdc7775d1d650325681da7e0a267962 (diff)
[X86] Use 128-bit ops for 256-bit vzmovl patterns.
128-bit ops implicitly zero the upper bits. This should address the comment
about domain crossing for the integer version without AVX2, since we can use
a 128-bit VPBLENDW without AVX2.

The only bad thing I see here is that we failed to reuse a vxorps in some of
the tests, but I think that's an already-known issue.

llvm-svn: 337134
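To make the implicit-zeroing point concrete, here is a small intrinsics
sketch of the f32 case. It is illustrative only, not code from the commit:
the function name is made up, and it assumes a compiler that provides the
_mm256_zextps128_ps256 intrinsic (recent Clang, or GCC 10 and later).

    #include <immintrin.h>

    /* Keep element 0 of a 256-bit vector and zero the rest, mirroring the
     * new v8f32 pattern. Illustrative sketch, not from the commit. */
    static __m256 move_low_f32_clear_high(__m256 v) {
        __m128 lo = _mm256_castps256_ps128(v); /* EXTRACT_SUBREG ..., sub_xmm */
        __m128 z  = _mm_setzero_ps();          /* V_SET0 (vxorps %xmm,%xmm)   */
        __m128 b  = _mm_blend_ps(z, lo, 1);    /* VBLENDPSrri, mask (i8 1)    */
        /* SUBREG_TO_REG: a VEX-encoded 128-bit op has already zeroed bits
         * 255:128 of the destination YMM, so the zero-extension is free.    */
        return _mm256_zextps128_ps256(b);
    }

Every VEX-encoded 128-bit instruction zeroes bits 255:128 of its destination
YMM register, which is why the final zero-extension needs no extra
instructions.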
Diffstat (limited to 'llvm/lib')
-rw-r--r--  llvm/lib/Target/X86/X86InstrSSE.td | 27 +++++++++++++++++----------
1 file changed, 17 insertions(+), 10 deletions(-)
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index eb6a3323491..c2e1a94f408 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -6431,19 +6431,26 @@ let Predicates = [HasAVX, OptForSpeed] in {
   // Move low f32 and clear high bits.
   def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
-            (VBLENDPSYrri (v8f32 (AVX_SET0)), VR256:$src, (i8 1))>;
+            (SUBREG_TO_REG (i32 0),
+             (VBLENDPSrri (v4f32 (V_SET0)),
+                          (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm),
+                          (i8 1)), sub_xmm)>;
+  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
+            (SUBREG_TO_REG (i32 0),
+             (VPBLENDWrri (v4i32 (V_SET0)),
+                          (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm),
+                          (i8 3)), sub_xmm)>;
 
-  // Move low f64 and clear high bits.
   def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
-            (VBLENDPDYrri (v4f64 (AVX_SET0)), VR256:$src, (i8 1))>;
-
-  // These will incur an FP/int domain crossing penalty, but it may be the only
-  // way without AVX2. Do not add any complexity because we may be able to match
-  // more optimal patterns defined earlier in this file.
-  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
-            (VBLENDPSYrri (v8i32 (AVX_SET0)), VR256:$src, (i8 1))>;
+            (SUBREG_TO_REG (i32 0),
+             (VBLENDPDrri (v2f64 (V_SET0)),
+                          (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm),
+                          (i8 1)), sub_xmm)>;
   def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
-            (VBLENDPDYrri (v4i64 (AVX_SET0)), VR256:$src, (i8 1))>;
+            (SUBREG_TO_REG (i32 0),
+             (VPBLENDWrri (v2i64 (V_SET0)),
+                          (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm),
+                          (i8 0xf)), sub_xmm)>;
 }
 
 // Prefer a movss or movsd over a blendps when optimizing for size. these were
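The integer patterns above rely on the same trick, and staying on VPBLENDW
keeps the data in the integer domain, avoiding the FP/int bypass penalty the
removed comment warned about; a 128-bit VPBLENDW only needs AVX1, whereas a
256-bit integer blend would need AVX2. A hedged sketch of the two integer
cases (illustrative names, not from the commit or its tests; assumes the
_mm256_zextsi128_si256 intrinsic is available):

    #include <immintrin.h>

    /* VPBLENDW selects 16-bit lanes, so the immediate covers the low element
     * in word-sized pieces: 0x3 spans one i32, 0xf spans one i64.
     * Illustrative sketches, not code from the commit. */
    static __m256i move_low_i32_clear_high(__m256i v) {
        __m128i lo = _mm256_castsi256_si128(v);
        __m128i b  = _mm_blend_epi16(_mm_setzero_si128(), lo, 0x3); /* (i8 3) */
        return _mm256_zextsi128_si256(b); /* upper 128 bits already zero */
    }

    static __m256i move_low_i64_clear_high(__m256i v) {
        __m128i lo = _mm256_castsi256_si128(v);
        __m128i b  = _mm_blend_epi16(_mm_setzero_si128(), lo, 0xf); /* (i8 0xf) */
        return _mm256_zextsi128_si256(b);
    }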