-rw-r--r--  llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp          41
-rw-r--r--  llvm/lib/Target/X86/Utils/X86ShuffleDecode.h             9
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp                  16
-rw-r--r--  llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll    19
4 files changed, 80 insertions, 5 deletions
diff --git a/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp
index f007349d184..768f688a09d 100644
--- a/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp
+++ b/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp
@@ -342,7 +342,46 @@ void DecodeBLENDMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
   }
 }
 
-/// DecodeVPERMMask - this decodes the shuffle masks for VPERMQ/VPERMPD.
+void DecodeVPPERMMask(ArrayRef<uint64_t> RawMask,
+                      SmallVectorImpl<int> &ShuffleMask) {
+  assert(RawMask.size() == 16 && "Illegal VPPERM shuffle mask size");
+
+  // VPPERM Operation
+  // Bits[4:0] - Byte Index (0 - 31)
+  // Bits[7:5] - Permute Operation
+  //
+  // Permute Operation:
+  // 0 - Source byte (no logical operation).
+  // 1 - Invert source byte.
+  // 2 - Bit reverse of source byte.
+  // 3 - Bit reverse of inverted source byte.
+  // 4 - 00h (zero - fill).
+  // 5 - FFh (ones - fill).
+  // 6 - Most significant bit of source byte replicated in all bit positions.
+  // 7 - Invert most significant bit of source byte and replicate in all bit positions.
+  for (int i = 0, e = RawMask.size(); i < e; ++i) {
+    uint64_t M = RawMask[i];
+    if (M == (uint64_t)SM_SentinelUndef) {
+      ShuffleMask.push_back(M);
+      continue;
+    }
+
+    uint64_t PermuteOp = (M >> 5) & 0x7;
+    if (PermuteOp == 4) {
+      ShuffleMask.push_back(SM_SentinelZero);
+      continue;
+    }
+    if (PermuteOp != 0) {
+      ShuffleMask.clear();
+      return;
+    }
+
+    uint64_t Index = M & 0x1F;
+    ShuffleMask.push_back((int)Index);
+  }
+}
+
+/// DecodeVPERMMask - this decodes the shuffle masks for VPERMQ/VPERMPD.
 /// No VT provided since it only works on 256-bit, 4 element vectors.
 void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
   for (unsigned i = 0; i != 4; ++i) {
diff --git a/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h b/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h
index 0c92b39e07f..4a4d33b3689 100644
--- a/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h
+++ b/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h
@@ -73,7 +73,7 @@ void DecodePSHUFLWMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
 /// Decodes a PSWAPD 3DNow! instruction.
 void DecodePSWAPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
 
-/// Decodes the shuffle masks for shufp*. 
+/// Decodes the shuffle masks for shufp*.
 /// VT indicates the type of the vector allowing it to handle different
 /// datatypes and vector widths.
 void DecodeSHUFPMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
@@ -108,6 +108,13 @@ void decodeVSHUF64x2FamilyMask(MVT VT, unsigned Imm,
 /// No VT provided since it only works on 256-bit, 4 element vectors.
 void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
 
+/// Decode a VPPERM mask from a raw array of constants such as from
+/// BUILD_VECTOR.
+/// This can only decode basic masks (permutes + zeros), not any of the other
+/// operations that VPPERM can perform.
+void DecodeVPPERMMask(ArrayRef<uint64_t> RawMask,
+                      SmallVectorImpl<int> &ShuffleMask);
+
 /// Decode a zero extension instruction as a shuffle mask.
 void DecodeZeroExtendMask(MVT SrcScalarVT, MVT DstVT,
                           SmallVectorImpl<int> &ShuffleMask);
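For readers unfamiliar with XOP VPPERM: each selector byte uses bits [4:0] as an index into the 32 bytes of the two source registers and bits [7:5] as a permute operation, and only op 0 (plain byte move) and op 4 (zero-fill) can be represented in a generic shuffle mask, which is why the decoder bails out on everything else. Below is a minimal standalone sketch of that handling, not part of the patch; the sentinel constants and std::vector are simplified stand-ins for LLVM's SM_SentinelUndef/SM_SentinelZero and SmallVectorImpl<int>.

```cpp
// Standalone sketch (not part of the patch) of the control-byte handling that
// DecodeVPPERMMask implements.
#include <cstdint>
#include <cstdio>
#include <vector>

constexpr int SentinelZero = -2; // stand-in for SM_SentinelZero

// Returns false if any byte uses a permute op the decoder cannot express
// (invert, bit-reverse, ones-fill, MSB splat), mirroring ShuffleMask.clear().
static bool decodeVPPERM(const std::vector<uint64_t> &RawMask,
                         std::vector<int> &ShuffleMask) {
  for (uint64_t M : RawMask) {
    uint64_t PermuteOp = (M >> 5) & 0x7; // Bits[7:5]: permute operation
    uint64_t Index = M & 0x1F;           // Bits[4:0]: source byte index 0-31
    if (PermuteOp == 4) {                // 4 = zero-fill
      ShuffleMask.push_back(SentinelZero);
      continue;
    }
    if (PermuteOp != 0) {                // anything but a plain byte move
      ShuffleMask.clear();
      return false;
    }
    ShuffleMask.push_back((int)Index);
  }
  return true;
}

int main() {
  // 0x1F -> byte 31 (second source), 0x80 -> zero-fill, 0x03 -> byte 3.
  std::vector<uint64_t> Raw = {0x1F, 0x80, 0x03};
  std::vector<int> Mask;
  if (decodeVPPERM(Raw, Mask))
    for (int M : Mask)
      std::printf("%d ", M); // prints: 31 -2 3
  return 0;
}
```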
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 7f699762262..284925c9030 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -3871,6 +3871,7 @@ static bool isTargetShuffle(unsigned Opcode) {
   case X86ISD::VPERMILPV:
   case X86ISD::VPERM2X128:
   case X86ISD::VPERMI:
+  case X86ISD::VPPERM:
   case X86ISD::VPERMV:
   case X86ISD::VPERMV3:
   case X86ISD::VZEXT_MOVL:
@@ -5008,6 +5009,20 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
   case X86ISD::MOVLPS:
     // Not yet implemented
     return false;
+  case X86ISD::VPPERM: {
+    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
+    SDValue MaskNode = N->getOperand(2);
+    SmallVector<uint64_t, 32> RawMask;
+    if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
+      DecodeVPPERMMask(RawMask, Mask);
+      break;
+    }
+    if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {
+      DecodeVPPERMMask(C, Mask);
+      break;
+    }
+    return false;
+  }
   case X86ISD::VPERMV: {
     IsUnary = true;
     // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
@@ -29688,6 +29703,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case X86ISD::MOVDDUP:
   case X86ISD::MOVSS:
   case X86ISD::MOVSD:
+  case X86ISD::VPPERM:
   case X86ISD::VPERMV3:
   case X86ISD::VPERMILPI:
   case X86ISD::VPERMILPV:
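With VPPERM now handled by getTargetShuffleMask, the target shuffle combining in X86ISelLowering can merge chained VPPERMs by composing their decoded masks. The rough standalone model below, not LLVM code, shows why the first test in the updated file collapses to a single vmovaps: looking each entry of the outer reverse mask up in the inner reverse mask yields indices 16..31, i.e. a plain copy of the second source.

```cpp
// Rough model (not LLVM code) of composing two decoded VPPERM byte masks.
// Each entry of the outer mask (0-15) selects a byte of the inner shuffle's
// result, so composition is a simple lookup: Composed[i] = Inner[Outer[i]].
#include <array>
#include <cstdio>

int main() {
  std::array<int, 16> Inner, Outer, Composed;
  for (int i = 0; i != 16; ++i) {
    Inner[i] = 31 - i; // first vpperm: bytes 31..16 (operand 1, reversed)
    Outer[i] = 15 - i; // second vpperm: bytes 15..0 of the inner result
  }
  for (int i = 0; i != 16; ++i)
    Composed[i] = Inner[Outer[i]];
  for (int i = 0; i != 16; ++i)
    std::printf("%d ", Composed[i]); // prints 16 17 ... 31: identity of operand 1
  return 0;
}
```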
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll
index 91655da36fb..b87038a0801 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll
@@ -13,18 +13,31 @@ declare <16 x i8> @llvm.x86.xop.vpperm(<16 x i8>, <16 x i8>, <16 x i8>) nounwind
 define <16 x i8> @combine_vpperm_identity(<16 x i8> %a0, <16 x i8> %a1) {
 ; CHECK-LABEL: combine_vpperm_identity:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpperm {{.*#+}} xmm0 = xmm1[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
-; CHECK-NEXT:    vpperm {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; CHECK-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %res0 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> <i8 31, i8 30, i8 29, i8 28, i8 27, i8 26, i8 25, i8 24, i8 23, i8 22, i8 21, i8 20, i8 19, i8 18, i8 17, i8 16>)
   %res1 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %res0, <16 x i8> undef, <16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
   ret <16 x i8> %res1
 }
 
+define <16 x i8> @combine_vpperm_identity_bitcast(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: combine_vpperm_identity_bitcast:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpaddq {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %mask = bitcast <2 x i64> <i64 1084818905618843912, i64 506097522914230528> to <16 x i8>
+  %res0 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %mask)
+  %res1 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %res0, <16 x i8> undef, <16 x i8> %mask)
+  %res2 = bitcast <16 x i8> %res1 to <2 x i64>
+  %res3 = add <2 x i64> %res2, <i64 1084818905618843912, i64 506097522914230528>
+  %res4 = bitcast <2 x i64> %res3 to <16 x i8>
+  ret <16 x i8> %res4
+}
+
 define <16 x i8> @combine_vpperm_as_unary_unpckhwd(<16 x i8> %a0, <16 x i8> %a1) {
 ; CHECK-LABEL: combine_vpperm_as_unary_unpckhwd:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpperm {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; CHECK-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; CHECK-NEXT:    retq
   %res0 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a0, <16 x i8> <i8 8, i8 24, i8 9, i8 25, i8 10, i8 26, i8 11, i8 27, i8 12, i8 28, i8 13, i8 29, i8 14, i8 30, i8 15, i8 31>)
   ret <16 x i8> %res0
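For context on the last test above: the constant mask interleaves bytes 8..15 of the first source with bytes 24..31 of the second, and with both operands equal to %a0 that is exactly the unary unpack-high-bytes pattern, so the combiner now emits vpunpckhbw instead of vpperm. A quick standalone sketch, not LLVM code, of that mask construction:

```cpp
// Sketch (not LLVM code): build the byte mask for "unpack high bytes" of two
// 16-byte sources A (bytes 0-15) and B (bytes 16-31) and print it. With A == B
// this is the unary pattern matched as vpunpckhbw in the test above.
#include <cstdio>

int main() {
  int Mask[16];
  for (int i = 0; i != 8; ++i) {
    Mask[2 * i] = 8 + i;      // high byte i of source A
    Mask[2 * i + 1] = 24 + i; // high byte i of source B
  }
  for (int i = 0; i != 16; ++i)
    std::printf("%d ", Mask[i]); // prints: 8 24 9 25 10 26 11 27 12 28 13 29 14 30 15 31
  return 0;
}
```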