[X86][SSE] Use pblendw for v4i32/v2i64 during isel.

Summary: Previously we used BLENDPS/BLENDPD but that puts the blend in the FP domain. Under optsize, the two address instruction pass can cause blendps/blendpd to commute to blendps/blendpd. But we probably shouldn't do that if the original type was a integer. So use pblendw instead. Reviewers: spatel, RKSimon Reviewed By: RKSimon Subscribers: jdoerfert, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D58574 llvm-svn: 354755
author: Craig Topper <craig.topper@intel.com> 2019-02-24 19:23:41 +0000
committer: Craig Topper <craig.topper@intel.com> 2019-02-24 19:23:41 +0000
commit: 5532a9873765e1f068a1fd85f2bba803dc23c378 (patch)
tree: 443c272993f268a1a02453af933f156f1c0fc28e /llvm/lib/Target/X86
parent: ce2bd19c49bf7541cb8c1a072332db0f9cc277d3 (diff)
download: bcm5719-llvm-5532a9873765e1f068a1fd85f2bba803dc23c378.tar.gz
bcm5719-llvm-5532a9873765e1f068a1fd85f2bba803dc23c378.zip
1 files changed, 63 insertions, 13 deletions
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index e61666781b0..7f085183875 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -6397,6 +6397,50 @@ def BlendCommuteImm8 : SDNodeXForm<imm, [{
   return getI8Imm(Imm ^ 0xff, SDLoc(N));
 }]>;
 
+// Turn a 4-bit blendi immediate to 8-bit for use with pblendw.
+def BlendScaleImm4 : SDNodeXForm<imm, [{
+  uint8_t Imm = N->getZExtValue();
+  uint8_t NewImm = 0;
+  for (unsigned i = 0; i != 4; ++i) {
+    if (Imm & (1 << i))
+      NewImm |= 0x3 << (i * 2);
+  }
+  return getI8Imm(NewImm, SDLoc(N));
+}]>;
+
+// Turn a 2-bit blendi immediate to 8-bit for use with pblendw.
+def BlendScaleImm2 : SDNodeXForm<imm, [{
+  uint8_t Imm = N->getZExtValue();
+  uint8_t NewImm = 0;
+  for (unsigned i = 0; i != 2; ++i) {
+    if (Imm & (1 << i))
+      NewImm |= 0xf << (i * 4);
+  }
+  return getI8Imm(NewImm, SDLoc(N));
+}]>;
+
+// Turn a 4-bit blendi immediate to 8-bit for use with pblendw and invert it.
+def BlendScaleCommuteImm4 : SDNodeXForm<imm, [{
+  uint8_t Imm = N->getZExtValue();
+  uint8_t NewImm = 0;
+  for (unsigned i = 0; i != 4; ++i) {
+    if (Imm & (1 << i))
+      NewImm |= 0x3 << (i * 2);
+  }
+  return getI8Imm(NewImm ^ 0xff, SDLoc(N));
+}]>;
+
+// Turn a 2-bit blendi immediate to 8-bit for use with pblendw and invert it.
+def BlendScaleCommuteImm2 : SDNodeXForm<imm, [{
+  uint8_t Imm = N->getZExtValue();
+  uint8_t NewImm = 0;
+  for (unsigned i = 0; i != 2; ++i) {
+    if (Imm & (1 << i))
+      NewImm |= 0xf << (i * 4);
+  }
+  return getI8Imm(NewImm ^ 0xff, SDLoc(N));
+}]>;
+
 let Predicates = [HasAVX] in {
   let isCommutable = 0 in {
     defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
@@ -6507,7 +6551,7 @@ let Predicates = [HasAVX2] in {
                                    VEX_4V, VEX_L, VEX_WIG;
 }
 
-// Emulate vXi32/vXi64 blends with vXf32/vXf64.
+// Emulate vXi32/vXi64 blends with vXf32/vXf64 or pblendw.
 // ExecutionDomainFixPass will cleanup domains later on.
 let Predicates = [HasAVX] in {
 def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), imm:$src3),
@@ -6517,12 +6561,14 @@ def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), imm:$src3),
 def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, imm:$src3),
           (VBLENDPDYrmi VR256:$src1, addr:$src2, (BlendCommuteImm4 imm:$src3))>;
 
+// Use pblendw for 128-bit integer to keep it in the integer domain and prevent
+// it from becoming movsd via commuting under optsize.
 def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), imm:$src3),
-          (VBLENDPDrri VR128:$src1, VR128:$src2, imm:$src3)>;
+          (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 imm:$src3))>;
 def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), imm:$src3),
-          (VBLENDPDrmi VR128:$src1, addr:$src2, imm:$src3)>;
+          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 imm:$src3))>;
 def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, imm:$src3),
-          (VBLENDPDrmi VR128:$src1, addr:$src2, (BlendCommuteImm2 imm:$src3))>;
+          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 imm:$src3))>;
 }
 
 let Predicates = [HasAVX1Only] in {
@@ -6533,12 +6579,14 @@ def : Pat<(X86Blendi VR256:$src1, (loadv8i32 addr:$src2), imm:$src3),
 def : Pat<(X86Blendi (loadv8i32 addr:$src2), VR256:$src1, imm:$src3),
           (VBLENDPSYrmi VR256:$src1, addr:$src2, (BlendCommuteImm8 imm:$src3))>;
 
+// Use pblendw for 128-bit integer to keep it in the integer domain and prevent
+// it from becoming movss via commuting under optsize.
 def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), imm:$src3),
-          (VBLENDPSrri VR128:$src1, VR128:$src2, imm:$src3)>;
+          (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 imm:$src3))>;
 def : Pat<(X86Blendi VR128:$src1, (loadv4i32 addr:$src2), imm:$src3),
-          (VBLENDPSrmi VR128:$src1, addr:$src2, imm:$src3)>;
+          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 imm:$src3))>;
 def : Pat<(X86Blendi (loadv4i32 addr:$src2), VR128:$src1, imm:$src3),
-          (VBLENDPSrmi VR128:$src1, addr:$src2, (BlendCommuteImm4 imm:$src3))>;
+          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 imm:$src3))>;
 }
 
 defm BLENDPS : SS41I_blend_rmi<0x0C, "blendps", X86Blendi, v4f32,
@@ -6552,19 +6600,21 @@ defm PBLENDW : SS41I_blend_rmi<0x0E, "pblendw", X86Blendi, v8i16,
                                SchedWriteBlend.XMM, BlendCommuteImm8>;
 
 let Predicates = [UseSSE41] in {
+// Use pblendw for 128-bit integer to keep it in the integer domain and prevent
+// it from becoming movss via commuting under optsize.
 def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), imm:$src3),
-          (BLENDPDrri VR128:$src1, VR128:$src2, imm:$src3)>;
+          (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 imm:$src3))>;
 def : Pat<(X86Blendi VR128:$src1, (memopv2i64 addr:$src2), imm:$src3),
-          (BLENDPDrmi VR128:$src1, addr:$src2, imm:$src3)>;
+          (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 imm:$src3))>;
 def : Pat<(X86Blendi (memopv2i64 addr:$src2), VR128:$src1, imm:$src3),
-          (BLENDPDrmi VR128:$src1, addr:$src2, (BlendCommuteImm2 imm:$src3))>;
+          (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 imm:$src3))>;
 
 def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), imm:$src3),
-          (BLENDPSrri VR128:$src1, VR128:$src2, imm:$src3)>;
+          (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 imm:$src3))>;
 def : Pat<(X86Blendi VR128:$src1, (memopv4i32 addr:$src2), imm:$src3),
-          (BLENDPSrmi VR128:$src1, addr:$src2, imm:$src3)>;
+          (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 imm:$src3))>;
 def : Pat<(X86Blendi (memopv4i32 addr:$src2), VR128:$src1, imm:$src3),
-          (BLENDPSrmi VR128:$src1, addr:$src2, (BlendCommuteImm4 imm:$src3))>;
+          (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 imm:$src3))>;
 }
 
 // For insertion into the zero index (low half) of a 256-bit vector, it is
author	Craig Topper <craig.topper@intel.com>	2019-02-24 19:23:41 +0000
committer	Craig Topper <craig.topper@intel.com>	2019-02-24 19:23:41 +0000
commit	5532a9873765e1f068a1fd85f2bba803dc23c378 (patch)
tree	443c272993f268a1a02453af933f156f1c0fc28e /llvm/lib/Target/X86
parent	ce2bd19c49bf7541cb8c1a072332db0f9cc277d3 (diff)
download	bcm5719-llvm-5532a9873765e1f068a1fd85f2bba803dc23c378.tar.gz bcm5719-llvm-5532a9873765e1f068a1fd85f2bba803dc23c378.zip