author    Simon Pilgrim <llvm-dev@redking.me.uk>  2015-11-03 20:27:01 +0000
committer Simon Pilgrim <llvm-dev@redking.me.uk>  2015-11-03 20:27:01 +0000
commit    e88dc04c4854124c6cf4c2d0fc554029ea190846 (patch)
tree      58d9c047e0b7c6f1414b269108364704a1b89962 /llvm
parent    f4acad30ec08c6fd66d63fa09f300c55c4afd90b (diff)
[X86][XOP] Add support for the matching of the VPCMOV bit select instruction
XOP has the VPCMOV instruction that performs the common vector bit select operation:

    OR( AND( SRC1, SRC3 ), AND( SRC2, ~SRC3 ) )

This patch adds tablegen pattern matching for this instruction.

Differential Revision: http://reviews.llvm.org/D8841

llvm-svn: 251975
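For illustration (not part of the commit; the function name is hypothetical), here is a minimal IR sketch of the and/not/and/or sequence the new patterns recognize. Each result bit comes from %a where the mask bit is 1 and from %b where it is 0; with -mattr=+xop this now selects to a single vpcmov:

define <2 x i64> @bitselect_v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %m) {
  %t0 = and <2 x i64> %a, %m                    ; bits of %a selected by %m
  %notm = xor <2 x i64> %m, <i64 -1, i64 -1>    ; ~%m
  %t1 = and <2 x i64> %b, %notm                 ; bits of %b selected by ~%m
  %r = or <2 x i64> %t0, %t1
  ret <2 x i64> %r
}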
Diffstat (limited to 'llvm')
-rw-r--r--  llvm/lib/IR/AutoUpgrade.cpp                      11
-rw-r--r--  llvm/lib/Target/X86/X86InstrXOP.td               10
-rw-r--r--  llvm/test/CodeGen/X86/xop-intrinsics-x86_64.ll    5
-rw-r--r--  llvm/test/CodeGen/X86/xop-pcmov.ll              162
4 files changed, 185 insertions, 3 deletions
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 9de149559df..12c354c89b2 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -200,6 +200,7 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
Name == "x86.avx2.pblendd.128" ||
Name == "x86.avx2.pblendd.256" ||
Name == "x86.avx2.vbroadcasti128" ||
+ Name == "x86.xop.vpcmov" ||
(Name.startswith("x86.xop.vpcom") && F->arg_size() == 2)) {
NewFn = nullptr;
return true;
@@ -457,6 +458,16 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Rep =
Builder.CreateCall(VPCOM, {CI->getArgOperand(0), CI->getArgOperand(1),
Builder.getInt8(Imm)});
+ } else if (Name == "llvm.x86.xop.vpcmov") {
+ Value *Arg0 = CI->getArgOperand(0);
+ Value *Arg1 = CI->getArgOperand(1);
+ Value *Sel = CI->getArgOperand(2);
+ unsigned NumElts = CI->getType()->getVectorNumElements();
+ Constant *MinusOne = ConstantVector::getSplat(NumElts, Builder.getInt64(-1));
+ Value *NotSel = Builder.CreateXor(Sel, MinusOne);
+ Value *Sel0 = Builder.CreateAnd(Arg0, Sel);
+ Value *Sel1 = Builder.CreateAnd(Arg1, NotSel);
+ Rep = Builder.CreateOr(Sel0, Sel1);
} else if (Name == "llvm.x86.sse42.crc32.64.8") {
Function *CRC32 = Intrinsic::getDeclaration(F->getParent(),
Intrinsic::x86_sse42_crc32_32_8);
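The upgrade hook above rewrites calls to the now-retired 128-bit intrinsic into generic IR, which the new tablegen patterns then re-select back to vpcmov. A sketch of the transformation, with hypothetical value names:

; before: a call to the auto-upgraded intrinsic
%r = call <2 x i64> @llvm.x86.xop.vpcmov(<2 x i64> %a, <2 x i64> %b, <2 x i64> %m)

; after: the same value computed with generic IR
%notm = xor <2 x i64> %m, <i64 -1, i64 -1>
%s0 = and <2 x i64> %a, %m
%s1 = and <2 x i64> %b, %notm
%r = or <2 x i64> %s0, %s1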
diff --git a/llvm/lib/Target/X86/X86InstrXOP.td b/llvm/lib/Target/X86/X86InstrXOP.td
index df9d906f4d9..4cb2304e464 100644
--- a/llvm/lib/Target/X86/X86InstrXOP.td
+++ b/llvm/lib/Target/X86/X86InstrXOP.td
@@ -281,6 +281,16 @@ multiclass xop4op256<bits<8> opc, string OpcodeStr, Intrinsic Int> {
let ExeDomain = SSEPackedInt in
defm VPCMOV : xop4op256<0xA2, "vpcmov", int_x86_xop_vpcmov_256>;
+let Predicates = [HasXOP] in {
+ def : Pat<(v2i64 (or (and VR128:$src3, VR128:$src1),
+ (X86andnp VR128:$src3, VR128:$src2))),
+ (VPCMOVrr VR128:$src1, VR128:$src2, VR128:$src3)>;
+
+ def : Pat<(v4i64 (or (and VR256:$src3, VR256:$src1),
+ (X86andnp VR256:$src3, VR256:$src2))),
+ (VPCMOVrrY VR256:$src1, VR256:$src2, VR256:$src3)>;
+}
+
multiclass xop5op<bits<8> opc, string OpcodeStr, Intrinsic Int128,
Intrinsic Int256, PatFrag ld_128, PatFrag ld_256> {
def rr : IXOP5<opc, MRMSrcReg, (outs VR128:$dst),
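A note on the patterns added above: X86andnp inverts its first operand, i.e. X86andnp(a, b) = AND(NOT(a), b), so each pattern computes OR(AND(SRC3, SRC1), AND(~SRC3, SRC2)) with $src3 as the selector mask, matching the operation described in the commit message. Patterns are only written for v2i64 and v4i64; judging from the test coverage below, the other integer element widths and the floating-point cases reach instruction selection as logic ops on these canonical types, though that reading is an inference, not something stated in the commit.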
diff --git a/llvm/test/CodeGen/X86/xop-intrinsics-x86_64.ll b/llvm/test/CodeGen/X86/xop-intrinsics-x86_64.ll
index e96fed36d15..3b4c6ea1210 100644
--- a/llvm/test/CodeGen/X86/xop-intrinsics-x86_64.ll
+++ b/llvm/test/CodeGen/X86/xop-intrinsics-x86_64.ll
@@ -61,15 +61,14 @@ define <8 x float> @test_int_x86_xop_vpermil2ps_256(<8 x float> %a0, <8 x float>
declare <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
define <2 x i64> @test_int_x86_xop_vpcmov(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) {
- ; CHECK: vpcmov
+ ; CHECK: vpcmov %xmm2, %xmm1, %xmm0, %xmm0
%res = call <2 x i64> @llvm.x86.xop.vpcmov(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) ;
ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.xop.vpcmov(<2 x i64>, <2 x i64>, <2 x i64>) nounwind readnone
define <4 x i64> @test_int_x86_xop_vpcmov_256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) {
- ; CHECK: vpcmov
- ; CHECK: ymm
+ ; CHECK: vpcmov %ymm2, %ymm1, %ymm0, %ymm0
%res = call <4 x i64> @llvm.x86.xop.vpcmov.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) ;
ret <4 x i64> %res
}
diff --git a/llvm/test/CodeGen/X86/xop-pcmov.ll b/llvm/test/CodeGen/X86/xop-pcmov.ll
new file mode 100644
index 00000000000..165d4a7232d
--- /dev/null
+++ b/llvm/test/CodeGen/X86/xop-pcmov.ll
@@ -0,0 +1,162 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+xop | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+xop | FileCheck %s
+
+define <4 x double> @pcmov_4f64(<4 x double> %a, <4 x double> %b, <4 x double> %m) {
+; CHECK-LABEL: pcmov_4f64:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcmov %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %1 = bitcast <4 x double> %m to <4 x i64>
+ %2 = bitcast <4 x double> %a to <4 x i64>
+ %3 = and <4 x i64> %1, %2
+ %4 = xor <4 x i64> %1, <i64 -1, i64 -1, i64 -1, i64 -1>
+ %5 = bitcast <4 x double> %b to <4 x i64>
+ %6 = and <4 x i64> %4, %5
+ %7 = or <4 x i64> %3, %6
+ %8 = bitcast <4 x i64> %7 to <4 x double>
+ ret <4 x double> %8
+}
+
+define <2 x double> @pcmov_2f64(<2 x double> %a, <2 x double> %b, <2 x double> %m) {
+; CHECK-LABEL: pcmov_2f64:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %1 = bitcast <2 x double> %m to <2 x i64>
+ %2 = bitcast <2 x double> %a to <2 x i64>
+ %3 = and <2 x i64> %1, %2
+ %4 = xor <2 x i64> %1, <i64 -1, i64 -1>
+ %5 = bitcast <2 x double> %b to <2 x i64>
+ %6 = and <2 x i64> %4, %5
+ %7 = or <2 x i64> %3, %6
+ %8 = bitcast <2 x i64> %7 to <2 x double>
+ ret <2 x double> %8
+}
+
+define <8 x float> @pcmov_8f32(<8 x float> %a, <8 x float> %b, <8 x float> %m) {
+; CHECK-LABEL: pcmov_8f32:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcmov %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %1 = bitcast <8 x float> %m to <8 x i32>
+ %2 = bitcast <8 x float> %a to <8 x i32>
+ %3 = and <8 x i32> %1, %2
+ %4 = xor <8 x i32> %1, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+ %5 = bitcast <8 x float> %b to <8 x i32>
+ %6 = and <8 x i32> %4, %5
+ %7 = or <8 x i32> %3, %6
+ %8 = bitcast <8 x i32> %7 to <8 x float>
+ ret <8 x float> %8
+}
+
+define <4 x float> @pcmov_4f32(<4 x float> %a, <4 x float> %b, <4 x float> %m) {
+; CHECK-LABEL: pcmov_4f32:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %1 = bitcast <4 x float> %m to <4 x i32>
+ %2 = bitcast <4 x float> %a to <4 x i32>
+ %3 = and <4 x i32> %1, %2
+ %4 = xor <4 x i32> %1, <i32 -1, i32 -1, i32 -1, i32 -1>
+ %5 = bitcast <4 x float> %b to <4 x i32>
+ %6 = and <4 x i32> %4, %5
+ %7 = or <4 x i32> %3, %6
+ %8 = bitcast <4 x i32> %7 to <4 x float>
+ ret <4 x float> %8
+}
+
+define <4 x i64> @pcmov_4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %m) {
+; CHECK-LABEL: pcmov_4i64:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcmov %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %1 = and <4 x i64> %a, %m
+ %2 = xor <4 x i64> %m, <i64 -1, i64 -1, i64 -1, i64 -1>
+ %3 = and <4 x i64> %b, %2
+ %4 = or <4 x i64> %1, %3
+ ret <4 x i64> %4
+}
+
+define <2 x i64> @pcmov_2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %m) {
+; CHECK-LABEL: pcmov_2i64:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %1 = and <2 x i64> %a, %m
+ %2 = xor <2 x i64> %m, <i64 -1, i64 -1>
+ %3 = and <2 x i64> %b, %2
+ %4 = or <2 x i64> %1, %3
+ ret <2 x i64> %4
+}
+
+define <8 x i32> @pcmov_8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %m) {
+; CHECK-LABEL: pcmov_8i32:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcmov %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %1 = and <8 x i32> %a, %m
+ %2 = xor <8 x i32> %m, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+ %3 = and <8 x i32> %b, %2
+ %4 = or <8 x i32> %1, %3
+ ret <8 x i32> %4
+}
+
+define <4 x i32> @pcmov_4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %m) {
+; CHECK-LABEL: pcmov_4i32:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %1 = and <4 x i32> %a, %m
+ %2 = xor <4 x i32> %m, <i32 -1, i32 -1, i32 -1, i32 -1>
+ %3 = and <4 x i32> %b, %2
+ %4 = or <4 x i32> %1, %3
+ ret <4 x i32> %4
+}
+
+define <16 x i16> @pcmov_16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16> %m) {
+; CHECK-LABEL: pcmov_16i16:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcmov %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %1 = and <16 x i16> %a, %m
+ %2 = xor <16 x i16> %m, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+ %3 = and <16 x i16> %b, %2
+ %4 = or <16 x i16> %1, %3
+ ret <16 x i16> %4
+}
+
+define <8 x i16> @pcmov_8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %m) {
+; CHECK-LABEL: pcmov_8i16:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %1 = and <8 x i16> %a, %m
+ %2 = xor <8 x i16> %m, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+ %3 = and <8 x i16> %b, %2
+ %4 = or <8 x i16> %1, %3
+ ret <8 x i16> %4
+}
+
+define <32 x i8> @pcmov_32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %m) {
+; CHECK-LABEL: pcmov_32i8:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcmov %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %1 = and <32 x i8> %a, %m
+ %2 = xor <32 x i8> %m, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+ %3 = and <32 x i8> %b, %2
+ %4 = or <32 x i8> %1, %3
+ ret <32 x i8> %4
+}
+
+define <16 x i8> @pcmov_16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %m) {
+; CHECK-LABEL: pcmov_16i8:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %1 = and <16 x i8> %a, %m
+ %2 = xor <16 x i8> %m, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+ %3 = and <16 x i8> %b, %2
+ %4 = or <16 x i8> %1, %3
+ ret <16 x i8> %4
+}