diff options
| author | Craig Topper <craig.topper@intel.com> | 2019-03-03 00:18:07 +0000 |
|---|---|---|
| committer | Craig Topper <craig.topper@intel.com> | 2019-03-03 00:18:07 +0000 |
| commit | ce686597722cecdecf9c33d0cfc3ba7909cd65b1 (patch) | |
| tree | 9866c9b1e57594a92d868deb90f186208cc9685f /llvm/lib/Target | |
| parent | cf03bd92d64a70b81cea962dc5adf15d6ef5ce6f (diff) | |
| download | bcm5719-llvm-ce686597722cecdecf9c33d0cfc3ba7909cd65b1.tar.gz bcm5719-llvm-ce686597722cecdecf9c33d0cfc3ba7909cd65b1.zip | |
[X86] Prefer VPBLENDD for v2i64/v4i64 blends with AVX2.
We were using VPBLENDW for v2i64 and VBLENDPD for v4i64. VPBLENDD has better throughput than VPBLENDW on some CPUs so it makes sense to use it when possible. VBLENDPD will probably become VBLENDD during execution domain fixing, but we might as well use integer in isel while we can.
This should work around some issues with the domain fixing pass preferring PBLENDW when we start with PBLENDW. There may still be some v8i16 cases that could use PBLENDD.
llvm-svn: 355281
Diffstat (limited to 'llvm/lib/Target')
| -rw-r--r-- | llvm/lib/Target/X86/X86InstrSSE.td | 40 |
1 files changed, 37 insertions, 3 deletions
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index 78ae04e6926..5a94af5fbbc 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -6419,6 +6419,17 @@ def BlendScaleImm2 : SDNodeXForm<imm, [{ return getI8Imm(NewImm, SDLoc(N)); }]>; +// Turn a 2-bit blendi immediate to 4-bit for use with pblendd. +def BlendScaleImm2to4 : SDNodeXForm<imm, [{ + uint8_t Imm = N->getZExtValue(); + uint8_t NewImm = 0; + for (unsigned i = 0; i != 2; ++i) { + if (Imm & (1 << i)) + NewImm |= 0x3 << (i * 2); + } + return getI8Imm(NewImm, SDLoc(N)); +}]>; + // Turn a 4-bit blendi immediate to 8-bit for use with pblendw and invert it. def BlendScaleCommuteImm4 : SDNodeXForm<imm, [{ uint8_t Imm = N->getZExtValue(); @@ -6441,6 +6452,17 @@ def BlendScaleCommuteImm2 : SDNodeXForm<imm, [{ return getI8Imm(NewImm ^ 0xff, SDLoc(N)); }]>; +// Turn a 2-bit blendi immediate to 4-bit for use with pblendd and invert it. +def BlendScaleCommuteImm2to4 : SDNodeXForm<imm, [{ + uint8_t Imm = N->getZExtValue(); + uint8_t NewImm = 0; + for (unsigned i = 0; i != 2; ++i) { + if (Imm & (1 << i)) + NewImm |= 0x3 << (i * 2); + } + return getI8Imm(NewImm ^ 0xf, SDLoc(N)); +}]>; + let Predicates = [HasAVX] in { let isCommutable = 0 in { defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw, @@ -6553,7 +6575,7 @@ let Predicates = [HasAVX2] in { // Emulate vXi32/vXi64 blends with vXf32/vXf64 or pblendw. // ExecutionDomainFixPass will cleanup domains later on. 
-let Predicates = [HasAVX] in { +let Predicates = [HasAVX1Only] in { def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), imm:$src3), (VBLENDPDYrri VR256:$src1, VR256:$src2, imm:$src3)>; def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), imm:$src3), @@ -6569,9 +6591,7 @@ def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), imm:$src3), (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 imm:$src3))>; def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, imm:$src3), (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 imm:$src3))>; -} -let Predicates = [HasAVX1Only] in { def : Pat<(X86Blendi (v8i32 VR256:$src1), (v8i32 VR256:$src2), imm:$src3), (VBLENDPSYrri VR256:$src1, VR256:$src2, imm:$src3)>; def : Pat<(X86Blendi VR256:$src1, (loadv8i32 addr:$src2), imm:$src3), @@ -7867,6 +7887,20 @@ defm VPBLENDD : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v4i32, defm VPBLENDDY : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v8i32, SchedWriteBlend.YMM, VR256, i256mem, BlendCommuteImm8>, VEX_L; + +def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), imm:$src3), + (VPBLENDDYrri VR256:$src1, VR256:$src2, (BlendScaleImm4 imm:$src3))>; +def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), imm:$src3), + (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleImm4 imm:$src3))>; +def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, imm:$src3), + (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleCommuteImm4 imm:$src3))>; + +def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), imm:$src3), + (VPBLENDDrri VR128:$src1, VR128:$src2, (BlendScaleImm2to4 imm:$src3))>; +def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), imm:$src3), + (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleImm2to4 imm:$src3))>; +def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, imm:$src3), + (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2to4 imm:$src3))>; } // For insertion into the zero index (low half) of a 256-bit vector, it is |

