AVX-512: Implemented DAG lowering for shuff62x2/shufi62x2 instuctions ( Shuffle Packed Values at 128-bit Granularity )

Tests added , vector-shuffle-512-v8.ll test re-generated. Differential Revision: http://reviews.llvm.org/D10300 llvm-svn: 239697
author: Igor Breger <igor.breger@intel.com> 2015-06-14 13:07:47 +0000
committer: Igor Breger <igor.breger@intel.com> 2015-06-14 13:07:47 +0000
commit: 5e49697138f0015d80939318cfe2b46bebeeccf0 (patch)
tree: e9745879d20676be25cb4894e49aade439794a68
parent: ce1ce989e2b2fc3b992109f82b31a403db015ec6 (diff)
download: bcm5719-llvm-5e49697138f0015d80939318cfe2b46bebeeccf0.tar.gz
bcm5719-llvm-5e49697138f0015d80939318cfe2b46bebeeccf0.zip
3 files changed, 88 insertions, 4 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 9f1415dbdf1..6228e4a9672 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -9383,6 +9383,30 @@ static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1,
                      DAG.getConstant(PermMask, DL, MVT::i8));
 }
 
+/// \brief Handle lowering 4-lane 128-bit shuffles.
+static SDValue lowerV4X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1,
+                                        SDValue V2, ArrayRef<int> WidenedMask,
+                                        SelectionDAG &DAG) {
+
+  assert(WidenedMask.size() == 4 && "Unexpected mask size for 128bit shuffle!");
+  // form a 128-bit permutation.
+  // convert the 64-bit shuffle mask selection values into 128-bit selection
+  // bits defined by a vshuf64x2 instruction's immediate control byte.
+  unsigned PermMask = 0, Imm = 0;
+
+  for (int i = 0, Size = WidenedMask.size(); i < Size; ++i) {
+    if(WidenedMask[i] == SM_SentinelZero)
+      return SDValue();
+
+    // use first element in place of undef musk
+    Imm = (WidenedMask[i] == SM_SentinelUndef) ? 0 : WidenedMask[i];
+    PermMask |= (Imm % 4) << (i * 2);
+  }
+
+  return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
+                     DAG.getConstant(PermMask, DL, MVT::i8));
+}
+
 /// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
 /// shuffling each lane.
 ///
@@ -10176,6 +10200,10 @@ static SDValue lowerV8X64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   ArrayRef<int> Mask = SVOp->getMask();
   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
 
+  SmallVector<int, 4> WidenedMask;
+  if (canWidenShuffleElements(Mask, WidenedMask))
+    if(SDValue Op = lowerV4X128VectorShuffle(DL, VT, V1, V2, WidenedMask, DAG))
+      return Op;
   // X86 has dedicated unpack instructions that can handle specific blend
   // operations: UNPCKH and UNPCKL.
   if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 2, 10, 4, 12, 6, 14}))
diff --git a/llvm/test/CodeGen/X86/avx512-shuffle.ll b/llvm/test/CodeGen/X86/avx512-shuffle.ll
index 2683d6fe238..7e9eda58737 100644
--- a/llvm/test/CodeGen/X86/avx512-shuffle.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffle.ll
@@ -116,10 +116,10 @@ define <16 x i32> @test15(<16 x i32> %a) {
  ret <16 x i32> %b
 }
 ; CHECK-LABEL: test16
-; CHECK: valignq $2, %zmm0, %zmm1
+; CHECK: valignq $3, %zmm0, %zmm1
 ; CHECK: ret
 define <8 x double> @test16(<8 x double> %a, <8 x double> %b) nounwind {
-  %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+  %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
   ret <8 x double> %c
 }
 
@@ -252,6 +252,62 @@ define <8 x double> @test32(<8 x double> %a, <8 x double> %b) nounwind {
   ret <8 x double> %c
 }
 
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+define <8 x double> @test_vshuff64x2_512(<8 x double> %x, <8 x double> %x1) nounwind {
+; CHECK-LABEL: test_vshuff64x2_512:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vshuff64x2 $136, %zmm0, %zmm0, %zmm0
+; CHECK-NEXT:    retq
+  %res = shufflevector <8 x double> %x, <8 x double> %x1, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 0, i32 1,  i32 4, i32 5>
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_vshuff64x2_512_mask(<8 x double> %x, <8 x double> %x1, <8 x i1> %mask) nounwind {
+; CHECK-LABEL: test_vshuff64x2_512_mask:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpmovsxwq %xmm2, %zmm1
+; CHECK-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; CHECK-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; CHECK-NEXT:    vshuff64x2 $136, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+  %y = shufflevector <8 x double> %x, <8 x double> %x1, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 0, i32 1,  i32 4, i32 5>
+  %res = select <8 x i1> %mask, <8 x double> %y, <8 x double> zeroinitializer
+  ret <8 x double> %res
+}
+
+define <8 x i64> @test_vshufi64x2_512_mask(<8 x i64> %x, <8 x i64> %x1, <8 x i1> %mask) nounwind {
+; CHECK-LABEL: test_vshufi64x2_512_mask:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpmovsxwq %xmm2, %zmm1
+; CHECK-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; CHECK-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; CHECK-NEXT:    vshufi64x2 $168, %zmm0, %zmm0, %zmm0 {%k1}
+; CHECK-NEXT:    retq
+  %y = shufflevector <8 x i64> %x, <8 x i64> %x1, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 4, i32 5,  i32 4, i32 5>
+  %res = select <8 x i1> %mask, <8 x i64> %y, <8 x i64> %x
+  ret <8 x i64> %res
+}
+
+define <8 x double> @test_vshuff64x2_512_mem(<8 x double> %x, <8 x double> *%ptr) nounwind {
+; CHECK-LABEL: test_vshuff64x2_512_mem:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vshuff64x2 $40, %zmm0, %zmm0, %zmm0
+; CHECK-NEXT:    retq
+  %x1   = load <8 x double>,<8 x double> *%ptr,align 1
+  %res = shufflevector <8 x double> %x, <8 x double> %x1, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 4, i32 5,  i32 0, i32 1>
+  ret <8 x double> %res
+}
+
+define <16 x float> @test_vshuff32x4_512_mem(<16 x float> %x, <16 x float> *%ptr) nounwind {
+; CHECK-LABEL: test_vshuff32x4_512_mem:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vshuff64x2 $20, %zmm0, %zmm0, %zmm0
+; CHECK-NEXT:    retq
+  %x1   = load <16 x float>,<16 x float> *%ptr,align 1
+  %res = shufflevector <16 x float> %x, <16 x float> %x1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+  ret <16 x float> %res
+}
+
 define <16 x i32> @test_align_v16i32_rr(<16 x i32> %a, <16 x i32> %b) nounwind {
 ; CHECK-LABEL: test_align_v16i32_rr:
 ; CHECK:       ## BB#0:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
index 8dc76231856..2c6c8a3e7ad 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -88,7 +88,7 @@ define <8 x double> @shuffle_v8f64_70000000(<8 x double> %a, <8 x double> %b) {
 define <8 x double> @shuffle_v8f64_01014545(<8 x double> %a, <8 x double> %b) {
 ; ALL-LABEL: shuffle_v8f64_01014545:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermpd $68, %zmm0, %zmm0
+; ALL-NEXT:    vshuff64x2 $160, %zmm0, %zmm0, %zmm0
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
   ret <8 x double> %shuffle
@@ -650,7 +650,7 @@ define <8 x i64> @shuffle_v8i64_70000000(<8 x i64> %a, <8 x i64> %b) {
 define <8 x i64> @shuffle_v8i64_01014545(<8 x i64> %a, <8 x i64> %b) {
 ; ALL-LABEL: shuffle_v8i64_01014545:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermq $68, %zmm0, %zmm0
+; ALL-NEXT:    vshufi64x2 $160, %zmm0, %zmm0, %zmm0
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
   ret <8 x i64> %shuffle
author	Igor Breger <igor.breger@intel.com>	2015-06-14 13:07:47 +0000
committer	Igor Breger <igor.breger@intel.com>	2015-06-14 13:07:47 +0000
commit	5e49697138f0015d80939318cfe2b46bebeeccf0 (patch)
tree	e9745879d20676be25cb4894e49aade439794a68
parent	ce1ce989e2b2fc3b992109f82b31a403db015ec6 (diff)
download	bcm5719-llvm-5e49697138f0015d80939318cfe2b46bebeeccf0.tar.gz bcm5719-llvm-5e49697138f0015d80939318cfe2b46bebeeccf0.zip