author     Simon Pilgrim <llvm-dev@redking.me.uk>    2016-06-28 08:08:15 +0000
committer  Simon Pilgrim <llvm-dev@redking.me.uk>    2016-06-28 08:08:15 +0000
commit     c15d2178313a3b904158a472e4aa1f118e040395 (patch)
tree       c4106e39019e725374e21f045e25343eb5a59c90
parent     36b2c0a683ccfafbaad948ced64b6912aa5bf826 (diff)
[X86][SSE] Added support for combining target shuffles to (V)PSHUFD/VPERMILPD/VPERMILPS immediate permutes
This patch allows target shuffles to be combined into single-input immediate permute instructions - (V)PSHUFD/VPERMILPD/VPERMILPS - allowing more general pattern matching than we currently do, and it improves the likelihood of memory folding compared to existing patterns, which tend to reuse the input in multiple arguments.
Further permute instructions - (V)PSHUFLW/(V)PSHUFHW/(V)PERMQ/(V)PERMPD - may be added in the future, but it has proven tricky to create test cases for them so far. (V)PSHUFLW/(V)PSHUFHW are already handled quite well in combineTargetShuffle, so removing some of that code may let us perform more of the combining in one place without duplication.
Differential Revision: http://reviews.llvm.org/D21148
llvm-svn: 273999
22 files changed, 218 insertions, 117 deletions
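As a rough illustration of the 2-bits-per-lane immediate encoding these permutes rely on, here is a minimal standalone sketch (not part of the patch; `packV4ShuffleImm` and the example values are hypothetical stand-ins for the `getV4X86ShuffleImm` helper factored out in the diff below):

```cpp
#include <array>
#include <cassert>
#include <cstdio>

// Pack a 4-lane shuffle mask into the 8-bit immediate used by
// (V)PSHUFD/VPERMILPS: two bits per destination lane, with undef
// lanes (-1) defaulting to an identity selection.
static unsigned packV4ShuffleImm(const std::array<int, 4> &Mask) {
  unsigned Imm = 0;
  for (int i = 0; i != 4; ++i) {
    assert(Mask[i] >= -1 && Mask[i] < 4 && "Out of bound mask element!");
    Imm |= (Mask[i] < 0 ? i : Mask[i]) << (i * 2);
  }
  return Imm;
}

int main() {
  // The mask <2,0,3,1> packs to 0b01'11'00'10 == 0x72, i.e. the
  // immediate that would appear in "vpermilps $0x72, %xmm0, %xmm0".
  std::printf("0x%02x\n", packV4ShuffleImm({2, 0, 3, 1}));
  return 0;
}
```

The packed value is what the new matchPermuteVectorShuffle stores in PermuteImm for the (V)PSHUFD/VPERMILPS cases before the node is built (the VPERMILPD path packs one bit per 64-bit lane instead).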
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index f9ebaea4d12..a5ee5372e99 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7148,8 +7148,7 @@ static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
 /// example.
 ///
 /// NB: We rely heavily on "undef" masks preserving the input lane.
-static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
-                                          SelectionDAG &DAG) {
+static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
   assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
   assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
   assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
@@ -7161,7 +7160,12 @@ static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
   Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
   Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
   Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
-  return DAG.getConstant(Imm, DL, MVT::i8);
+  return Imm;
+}
+
+static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL,
+                                          SelectionDAG &DAG) {
+  return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
 }
 
 /// \brief Compute whether each element of a shuffle is zeroable.
@@ -24529,7 +24533,8 @@ static SDValue combineShuffle256(SDNode *N, SelectionDAG &DAG,
 static bool matchUnaryVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
                                     const X86Subtarget &Subtarget,
                                     unsigned &Shuffle, MVT &ShuffleVT) {
-  bool FloatDomain = SrcVT.isFloatingPoint();
+  bool FloatDomain = SrcVT.isFloatingPoint() ||
+                     (!Subtarget.hasAVX2() && SrcVT.is256BitVector());
 
   // Match a 128-bit integer vector against a VZEXT_MOVL (MOVQ) instruction.
   if (!FloatDomain && SrcVT.is128BitVector() &&
@@ -24607,6 +24612,83 @@ static bool matchUnaryVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
   return false;
 }
 
+// Attempt to match a combined shuffle mask against supported unary immediate
+// permute instructions.
+// TODO: Investigate sharing more of this with shuffle lowering.
+static bool matchPermuteVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
+                                      const X86Subtarget &Subtarget,
+                                      unsigned &Shuffle, MVT &ShuffleVT,
+                                      unsigned &PermuteImm) {
+  // Ensure we don't contain any zero elements.
+  for (int M : Mask) {
+    if (M == SM_SentinelZero)
+      return false;
+    assert(SM_SentinelUndef <= M && M < (int)Mask.size() &&
+           "Expected unary shuffle");
+  }
+
+  // We only support permutation of 32/64 bit elements.
+  // TODO - support PSHUFLW/PSHUFHW.
+  unsigned MaskScalarSizeInBits = SrcVT.getSizeInBits() / Mask.size();
+  if (MaskScalarSizeInBits != 32 && MaskScalarSizeInBits != 64)
+    return false;
+  MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
+
+  // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
+  // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
+  bool FloatDomain = SrcVT.isFloatingPoint();
+  if (FloatDomain && !Subtarget.hasAVX())
+    return false;
+
+  // Pre-AVX2 we must use float shuffles on 256-bit vectors.
+  if (SrcVT.is256BitVector() && !Subtarget.hasAVX2())
+    FloatDomain = true;
+
+  // TODO - support LaneCrossing for AVX2 PERMQ/PERMPD
+  if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask))
+    return false;
+
+  // VPERMILPD can permute with a non-repeating shuffle.
+  if (FloatDomain && MaskScalarSizeInBits == 64) {
+    Shuffle = X86ISD::VPERMILPI;
+    ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
+    PermuteImm = 0;
+    for (int i = 0, e = Mask.size(); i != e; ++i) {
+      int M = Mask[i];
+      if (M == SM_SentinelUndef)
+        continue;
+      assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
+      PermuteImm |= (M & 1) << i;
+    }
+    return true;
+  }
+
+  // We need a repeating shuffle mask for VPERMILPS/PSHUFD.
+  SmallVector<int, 4> RepeatedMask;
+  if (!is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask))
+    return false;
+
+  // Narrow the repeated mask for 32-bit element permutes.
+  SmallVector<int, 4> WordMask = RepeatedMask;
+  if (MaskScalarSizeInBits == 64) {
+    WordMask.clear();
+    for (int M : RepeatedMask) {
+      if (M == SM_SentinelUndef) {
+        WordMask.append(2, SM_SentinelUndef);
+        continue;
+      }
+      WordMask.push_back((M * 2) + 0);
+      WordMask.push_back((M * 2) + 1);
+    }
+  }
+
+  Shuffle = (FloatDomain ? X86ISD::VPERMILPI : X86ISD::PSHUFD);
+  ShuffleVT = (FloatDomain ? MVT::f32 : MVT::i32);
+  ShuffleVT = MVT::getVectorVT(ShuffleVT, SrcVT.getSizeInBits() / 32);
+  PermuteImm = getV4X86ShuffleImm(WordMask);
+  return true;
+}
+
 // Attempt to match a combined unary shuffle mask against supported binary
 // shuffle instructions.
 // TODO: Investigate sharing more of this with shuffle lowering.
@@ -24708,7 +24790,7 @@ static bool combineX86ShuffleChain(SDValue Input, SDValue Root,
 
   // Attempt to match the mask against known shuffle patterns.
   MVT ShuffleVT;
-  unsigned Shuffle;
+  unsigned Shuffle, PermuteImm;
 
   if (matchUnaryVectorShuffle(VT, Mask, Subtarget, Shuffle, ShuffleVT)) {
     if (Depth == 1 && Root.getOpcode() == Shuffle)
@@ -24722,6 +24804,20 @@ static bool combineX86ShuffleChain(SDValue Input, SDValue Root,
     return true;
   }
 
+  if (matchPermuteVectorShuffle(VT, Mask, Subtarget, Shuffle, ShuffleVT,
+                                PermuteImm)) {
+    if (Depth == 1 && Root.getOpcode() == Shuffle)
+      return false; // Nothing to do!
+    Res = DAG.getBitcast(ShuffleVT, Input);
+    DCI.AddToWorklist(Res.getNode());
+    Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
+                      DAG.getConstant(PermuteImm, DL, MVT::i8));
+    DCI.AddToWorklist(Res.getNode());
+    DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
+                  /*AddTo*/ true);
+    return true;
+  }
+
   if (matchBinaryVectorShuffle(VT, Mask, Shuffle, ShuffleVT)) {
     if (Depth == 1 && Root.getOpcode() == Shuffle)
       return false; // Nothing to do!
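One detail of the path above worth spelling out: 64-bit-element masks that fall through to the VPERMILPS/PSHUFD route are first widened to 32-bit words before being packed into the immediate. Below is a minimal standalone sketch of that widening step, assuming a plain `std::vector` in place of the patch's `SmallVector`/`SM_SentinelUndef` machinery (`widenTo32BitMask` is a hypothetical name):

```cpp
#include <cstdio>
#include <vector>

// Widen a 2-element (64-bit lane) shuffle mask into the equivalent
// 4-element (32-bit lane) mask, preserving undef (-1) entries.
static std::vector<int> widenTo32BitMask(const std::vector<int> &Mask64) {
  std::vector<int> WordMask;
  for (int M : Mask64) {
    if (M < 0) {                   // undef stays undef in both halves
      WordMask.push_back(-1);
      WordMask.push_back(-1);
      continue;
    }
    WordMask.push_back(M * 2 + 0); // low 32-bit half of element M
    WordMask.push_back(M * 2 + 1); // high 32-bit half of element M
  }
  return WordMask;
}

int main() {
  // <1,0> on v2i64 becomes <2,3,0,1> on v4i32 - the pshufd $0x4e
  // style swap seen in the updated tests below.
  for (int M : widenTo32BitMask({1, 0}))
    std::printf("%d ", M);
  std::printf("\n");
  return 0;
}
```

This widening is why several integer-domain tests below now expect `vpshufd` masks such as `[0,1,0,1]` and `[2,3,0,1]` where they previously used `vmovddup` or `vpermilpd`.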
diff --git a/llvm/test/CodeGen/X86/2012-01-12-extract-sv.ll b/llvm/test/CodeGen/X86/2012-01-12-extract-sv.ll index 68450add580..9bc4b5f55b6 100644 --- a/llvm/test/CodeGen/X86/2012-01-12-extract-sv.ll +++ b/llvm/test/CodeGen/X86/2012-01-12-extract-sv.ll @@ -6,7 +6,7 @@ define void @endless_loop() { ; CHECK-NEXT: vmovaps (%eax), %ymm0 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 ; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; CHECK-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0] +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] ; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; CHECK-NEXT: vxorps %ymm2, %ymm2, %ymm2 ; CHECK-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] diff --git a/llvm/test/CodeGen/X86/2012-04-26-sdglue.ll b/llvm/test/CodeGen/X86/2012-04-26-sdglue.ll index eaea7e857ea..2a76e1a66b2 100644 --- a/llvm/test/CodeGen/X86/2012-04-26-sdglue.ll +++ b/llvm/test/CodeGen/X86/2012-04-26-sdglue.ll @@ -10,9 +10,9 @@ define void @func() nounwind ssp { ; CHECK-NEXT: vmovups 0, %xmm0 ; CHECK-NEXT: vxorps %ymm1, %ymm1, %ymm1 ; CHECK-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,3,3] -; CHECK-NEXT: vpbroadcastd 32, %xmm3 -; CHECK-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,2,3,3] +; CHECK-NEXT: vbroadcastss 32, %xmm3 +; CHECK-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; CHECK-NEXT: vmulps %ymm0, %ymm2, %ymm2 ; CHECK-NEXT: vmulps %ymm0, %ymm0, %ymm0 ; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0 diff --git a/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll index cf90e5220fd..59eb4cdb195 100644 --- a/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll @@ -2385,14 +2385,14 @@ define <4 x i64> @test_mm256_set1_epi32(i32 %a0) nounwind { ; X32-LABEL: test_mm256_set1_epi32: ; X32: # BB#0: ; X32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: test_mm256_set1_epi32: ; X64: # BB#0: ; X64-NEXT: vmovd %edi, %xmm0 -; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X64-NEXT: retq %res0 = insertelement <8 x i32> undef, i32 %a0, i32 0 @@ -2422,7 +2422,7 @@ define <4 x i64> @test_mm256_set1_epi64x(i64 %a0) nounwind { ; X64-LABEL: test_mm256_set1_epi64x: ; X64: # BB#0: ; X64-NEXT: vmovq %rdi, %xmm0 -; X64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X64-NEXT: retq %res0 = insertelement <4 x i64> undef, i64 %a0, i32 0 diff --git a/llvm/test/CodeGen/X86/avx-intrinsics-x86.ll b/llvm/test/CodeGen/X86/avx-intrinsics-x86.ll index dc1414990ee..5e83cd0b603 100644 --- a/llvm/test/CodeGen/X86/avx-intrinsics-x86.ll +++ b/llvm/test/CodeGen/X86/avx-intrinsics-x86.ll @@ -4036,7 +4036,7 @@ define <4 x double> @test_x86_avx_vpermilvar_pd_256_2(<4 x double> %a0) { ; ; AVX512VL-LABEL: test_x86_avx_vpermilvar_pd_256_2: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpermilpd LCPI227_0, %ymm0, %ymm0 +; AVX512VL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] ; AVX512VL-NEXT: retl %res = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> <i64 2, i64 0, i64 0, i64 2>) ; <<4 x double>> [#uses=1] ret <4 x double> %res 
diff --git a/llvm/test/CodeGen/X86/avx-splat.ll b/llvm/test/CodeGen/X86/avx-splat.ll index d3ae5498a2b..1914b5134be 100644 --- a/llvm/test/CodeGen/X86/avx-splat.ll +++ b/llvm/test/CodeGen/X86/avx-splat.ll @@ -28,7 +28,7 @@ define <4 x i64> @funcC(i64 %q) nounwind uwtable readnone ssp { ; CHECK-LABEL: funcC: ; CHECK: ## BB#0: ## %entry ; CHECK-NEXT: vmovq %rdi, %xmm0 -; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; CHECK-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/avx-vbroadcast.ll b/llvm/test/CodeGen/X86/avx-vbroadcast.ll index b7030035444..6151e76ab0b 100644 --- a/llvm/test/CodeGen/X86/avx-vbroadcast.ll +++ b/llvm/test/CodeGen/X86/avx-vbroadcast.ll @@ -173,13 +173,13 @@ define <8 x i32> @load_splat_8i32_4i32_33333333(<4 x i32>* %ptr) nounwind uwtabl ; X32-LABEL: load_splat_8i32_4i32_33333333: ; X32: ## BB#0: ## %entry ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,3,3,3] +; X32-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,3,3,3] ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: load_splat_8i32_4i32_33333333: ; X64: ## BB#0: ## %entry -; X64-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,3,3,3] +; X64-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,3,3,3] ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X64-NEXT: retq entry: @@ -277,15 +277,13 @@ define <4 x i64> @load_splat_4i64_2i64_1111(<2 x i64>* %ptr) nounwind uwtable re ; X32-LABEL: load_splat_4i64_2i64_1111: ; X32: ## BB#0: ## %entry ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: vmovaps (%eax), %xmm0 -; X32-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1] +; X32-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,3,2,3] ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: load_splat_4i64_2i64_1111: ; X64: ## BB#0: ## %entry -; X64-NEXT: vmovaps (%rdi), %xmm0 -; X64-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1] +; X64-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,3,2,3] ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X64-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll index 401e6ccc5d8..59b7efdf9bf 100644 --- a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll +++ b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll @@ -760,7 +760,7 @@ define <4 x float> @merge_4f32_f32_X0YY(float* %ptr0, float* %ptr1) nounwind uwt ; AVX: # BB#0: ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0,0,1,1] +; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1] ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/pshufb-mask-comments.ll b/llvm/test/CodeGen/X86/pshufb-mask-comments.ll index b6f228c7d48..8364915fa0d 100644 --- a/llvm/test/CodeGen/X86/pshufb-mask-comments.ll +++ b/llvm/test/CodeGen/X86/pshufb-mask-comments.ll @@ -39,9 +39,9 @@ define <16 x i8> @test3(<16 x i8> %V) { define <16 x i8> @test4(<16 x i8> %V, <2 x i64>* %P) { ; CHECK-LABEL: test4: ; CHECK: # BB#0: -; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [1084818905618843912,506097522914230528] -; CHECK-NEXT: movdqa %xmm1, (%rdi) -; CHECK-NEXT: pshufb %xmm1, %xmm0 +; CHECK-NEXT: movaps {{.*#+}} xmm1 = [1084818905618843912,506097522914230528] +; CHECK-NEXT: movaps %xmm1, (%rdi) +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; CHECK-NEXT: retq %1 = insertelement <2 x i64> undef, i64 
1084818905618843912, i32 0 %2 = insertelement <2 x i64> %1, i64 506097522914230528, i32 1 diff --git a/llvm/test/CodeGen/X86/sse3.ll b/llvm/test/CodeGen/X86/sse3.ll index e028a41b1ec..6d51fb54f8b 100644 --- a/llvm/test/CodeGen/X86/sse3.ll +++ b/llvm/test/CodeGen/X86/sse3.ll @@ -207,7 +207,7 @@ define <8 x i16> @t12(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { ; X64: ## BB#0: ## %entry ; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,6,7] +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,3,3] ; X64-NEXT: retq entry: %tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 0, i32 1, i32 undef, i32 undef, i32 3, i32 11, i32 undef , i32 undef > @@ -220,7 +220,7 @@ define <8 x i16> @t13(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { ; X64: ## BB#0: ## %entry ; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] -; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,6,7] +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,3,3] ; X64-NEXT: retq entry: %tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 8, i32 9, i32 undef, i32 undef, i32 11, i32 3, i32 undef , i32 undef > diff --git a/llvm/test/CodeGen/X86/vector-compare-results.ll b/llvm/test/CodeGen/X86/vector-compare-results.ll index bfe1a65eb54..6c93f69ba61 100644 --- a/llvm/test/CodeGen/X86/vector-compare-results.ll +++ b/llvm/test/CodeGen/X86/vector-compare-results.ll @@ -1,4 +1,3 @@ -; NOTE: Assertions have been autogenerated by update_llc_test_checks.py ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE42 @@ -147,7 +146,7 @@ define <4 x i1> @test_cmp_v4f64(<4 x double> %a0, <4 x double> %a1) nounwind { ; AVX2-LABEL: test_cmp_v4f64: ; AVX2: # BB#0: ; AVX2-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6] +; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -684,10 +683,10 @@ define <8 x i1> @test_cmp_v8f64(<8 x double> %a0, <8 x double> %a1) nounwind { ; AVX2-LABEL: test_cmp_v8f64: ; AVX2: # BB#0: ; AVX2-NEXT: vcmpltpd %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6] +; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-NEXT: vcmpltpd %ymm1, %ymm3, %ymm1 -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6] +; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3] ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero @@ -2143,10 +2142,10 @@ define <16 x i1> @test_cmp_v16f64(<16 x double> %a0, <16 x double> %a1) nounwind ; AVX2-LABEL: test_cmp_v16f64: ; AVX2: # BB#0: ; AVX2-NEXT: vcmpltpd %ymm2, %ymm6, %ymm2 -; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,0,2,4,6,4,6] +; AVX2-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,2,0,2,4,6,4,6] ; 
AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] ; AVX2-NEXT: vcmpltpd %ymm3, %ymm7, %ymm3 -; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,0,2,4,6,4,6] +; AVX2-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,2,0,2,4,6,4,6] ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3] ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] @@ -2155,10 +2154,10 @@ define <16 x i1> @test_cmp_v16f64(<16 x double> %a0, <16 x double> %a1) nounwind ; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> ; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm2 ; AVX2-NEXT: vcmpltpd %ymm0, %ymm4, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6] +; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-NEXT: vcmpltpd %ymm1, %ymm5, %ymm1 -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6] +; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3] ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll index 47036117e61..2651063379f 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll @@ -925,7 +925,7 @@ define <16 x i8> @shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu( ; SSE2: # BB#0: ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll index 28947e7b670..0d50205aa4a 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll @@ -159,7 +159,7 @@ define <2 x double> @shuffle_v2f64_11(<2 x double> %a, <2 x double> %b) { ; ; AVX-LABEL: shuffle_v2f64_11: ; AVX: # BB#0: -; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1] +; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,1] ; AVX-NEXT: retq %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 1> ret <2 x double> %shuffle @@ -217,7 +217,7 @@ define <2 x double> @shuffle_v2f64_33(<2 x double> %a, <2 x double> %b) { ; ; AVX-LABEL: shuffle_v2f64_33: ; AVX: # BB#0: -; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm1[1,1] +; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,1] ; AVX-NEXT: retq %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 3, i32 3> ret <2 x double> %shuffle diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll index 8516e523aa5..76a403ae358 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll @@ -227,7 +227,7 @@ define <4 x float> @shuffle_v4f32_0011(<4 x float> %a, <4 x float> %b) { ; ; AVX-LABEL: shuffle_v4f32_0011: ; AVX: # BB#0: -; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX-NEXT: retq %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 1, i32 1> ret <4 x float> %shuffle @@ -240,7 +240,7 @@ define 
<4 x float> @shuffle_v4f32_2233(<4 x float> %a, <4 x float> %b) { ; ; AVX-LABEL: shuffle_v4f32_2233: ; AVX: # BB#0: -; AVX-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX-NEXT: retq %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 2, i32 3, i32 3> ret <4 x float> %shuffle @@ -1952,30 +1952,16 @@ define <4 x i32> @mask_v4i32_0127(<4 x i32> %a, <4 x i32> %b) { } define <4 x float> @broadcast_v4f32_0101_from_v2f32(<2 x float>* %x) { -; SSE2-LABEL: broadcast_v4f32_0101_from_v2f32: -; SSE2: # BB#0: -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0,0] -; SSE2-NEXT: retq -; -; SSE3-LABEL: broadcast_v4f32_0101_from_v2f32: -; SSE3: # BB#0: -; SSE3-NEXT: movddup {{.*#+}} xmm0 = mem[0,0] -; SSE3-NEXT: retq -; -; SSSE3-LABEL: broadcast_v4f32_0101_from_v2f32: -; SSSE3: # BB#0: -; SSSE3-NEXT: movddup {{.*#+}} xmm0 = mem[0,0] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: broadcast_v4f32_0101_from_v2f32: -; SSE41: # BB#0: -; SSE41-NEXT: movddup {{.*#+}} xmm0 = mem[0,0] -; SSE41-NEXT: retq +; SSE-LABEL: broadcast_v4f32_0101_from_v2f32: +; SSE: # BB#0: +; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE-NEXT: retq ; ; AVX-LABEL: broadcast_v4f32_0101_from_v2f32: ; AVX: # BB#0: -; AVX-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX-NEXT: retq %1 = load <2 x float>, <2 x float>* %x, align 1 %2 = shufflevector <2 x float> %1, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1> diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll index 23fbba61dab..e838278a227 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -1443,7 +1443,7 @@ define <16 x i16> @shuffle_v16i16_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_z ; AVX1: # BB#0: ; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1 ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] -; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v16i16_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll index 9c7ec845450..5bc04ae0770 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll @@ -488,7 +488,7 @@ define <4 x double> @shuffle_v4f64_15uu(<4 x double> %a, <4 x double> %b) { define <4 x double> @shuffle_v4f64_11uu(<4 x double> %a, <4 x double> %b) { ; ALL-LABEL: shuffle_v4f64_11uu: ; ALL: # BB#0: -; ALL-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1] +; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,1] ; ALL-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 1, i32 undef, i32 undef> ret <4 x double> %shuffle @@ -557,7 +557,7 @@ define <4 x i64> @shuffle_v4i64_0000(<4 x i64> %a, <4 x i64> %b) { define <4 x i64> @shuffle_v4i64_0001(<4 x i64> %a, <4 x i64> %b) { ; AVX1-LABEL: shuffle_v4i64_0001: ; AVX1: # BB#0: -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -579,7 +579,7 @@ define <4 x i64> @shuffle_v4i64_0020(<4 x i64> %a, <4 x i64> %b) { ; AVX1: # 
BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -641,8 +641,8 @@ define <4 x i64> @shuffle_v4i64_0300(<4 x i64> %a, <4 x i64> %b) { define <4 x i64> @shuffle_v4i64_1000(<4 x i64> %a, <4 x i64> %b) { ; AVX1-LABEL: shuffle_v4i64_1000: ; AVX1: # BB#0: -; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -800,7 +800,7 @@ define <4 x i64> @shuffle_v4i64_4012(<4 x i64> %a, <4 x i64> %b) { ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1],xmm2[0] -; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3] ; AVX1-NEXT: retq @@ -1320,7 +1320,7 @@ define <4 x double> @splat_v4f64(<2 x double> %r) { define <4 x i64> @splat_mem_v4i64_from_v2i64(<2 x i64>* %ptr) { ; AVX1-LABEL: splat_mem_v4i64_from_v2i64: ; AVX1: # BB#0: -; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -1397,7 +1397,7 @@ define <4 x double> @splat128_mem_v4f64_from_v2f64(<2 x double>* %ptr) { define <4 x double> @broadcast_v4f64_0000_from_v2i64(<2 x i64> %a0) { ; AVX1-LABEL: broadcast_v4f64_0000_from_v2i64: ; AVX1: # BB#0: -; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll index ceda13c22ee..f9448d9cf96 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll @@ -152,8 +152,8 @@ define <8 x float> @shuffle_v8f32_01014545(<8 x float> %a, <8 x float> %b) { define <8 x float> @shuffle_v8f32_00112233(<8 x float> %a, <8 x float> %b) { ; AVX1-LABEL: shuffle_v8f32_00112233: ; AVX1: # BB#0: -; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0,0,1,1] -; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,1,1] +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -196,7 +196,7 @@ define <8 x float> @shuffle_v8f32_08080808(<8 x float> %a, <8 x float> %b) { ; AVX1-LABEL: shuffle_v8f32_08080808: ; AVX1: # BB#0: ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -213,7 +213,7 @@ define <8 x float> @shuffle_v8f32_08084c4c(<8 x float> %a, <8 x float> %b) { ; ALL-LABEL: shuffle_v8f32_08084c4c: ; ALL: # BB#0: ; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] -; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 4, i32 
12, i32 4, i32 12> ret <8 x float> %shuffle @@ -907,8 +907,8 @@ define <8 x i32> @shuffle_v8i32_00000000(<8 x i32> %a, <8 x i32> %b) { define <8 x i32> @shuffle_v8i32_00000010(<8 x i32> %a, <8 x i32> %b) { ; AVX1-LABEL: shuffle_v8i32_00000010: ; AVX1: # BB#0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -924,8 +924,8 @@ define <8 x i32> @shuffle_v8i32_00000010(<8 x i32> %a, <8 x i32> %b) { define <8 x i32> @shuffle_v8i32_00000200(<8 x i32> %a, <8 x i32> %b) { ; AVX1-LABEL: shuffle_v8i32_00000200: ; AVX1: # BB#0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -941,8 +941,8 @@ define <8 x i32> @shuffle_v8i32_00000200(<8 x i32> %a, <8 x i32> %b) { define <8 x i32> @shuffle_v8i32_00003000(<8 x i32> %a, <8 x i32> %b) { ; AVX1-LABEL: shuffle_v8i32_00003000: ; AVX1: # BB#0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -1042,8 +1042,8 @@ define <8 x i32> @shuffle_v8i32_01014545(<8 x i32> %a, <8 x i32> %b) { define <8 x i32> @shuffle_v8i32_00112233(<8 x i32> %a, <8 x i32> %b) { ; AVX1-LABEL: shuffle_v8i32_00112233: ; AVX1: # BB#0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,1,1] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -1059,8 +1059,8 @@ define <8 x i32> @shuffle_v8i32_00112233(<8 x i32> %a, <8 x i32> %b) { define <8 x i32> @shuffle_v8i32_00001111(<8 x i32> %a, <8 x i32> %b) { ; AVX1-LABEL: shuffle_v8i32_00001111: ; AVX1: # BB#0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -1091,7 +1091,7 @@ define <8 x i32> @shuffle_v8i32_08080808(<8 x i32> %a, <8 x i32> %b) { ; AVX1-LABEL: shuffle_v8i32_08080808: ; AVX1: # BB#0: ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -1108,7 +1108,7 @@ define <8 x i32> @shuffle_v8i32_08084c4c(<8 x i32> %a, <8 x i32> %b) { ; AVX1-LABEL: shuffle_v8i32_08084c4c: ; AVX1: # BB#0: ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] -; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v8i32_08084c4c: @@ -1239,8 +1239,8 @@ define <8 x i32> @shuffle_v8i32_08991abb(<8 x i32> %a, <8 x i32> %b) { define <8 x i32> @shuffle_v8i32_091b2d3f(<8 x i32> %a, <8 x i32> %b) { ; AVX1-LABEL: shuffle_v8i32_091b2d3f: ; AVX1: # BB#0: -; AVX1-NEXT: 
vpermilps {{.*#+}} xmm2 = xmm0[0,1,1,3] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] ; AVX1-NEXT: retq @@ -1257,7 +1257,7 @@ define <8 x i32> @shuffle_v8i32_091b2d3f(<8 x i32> %a, <8 x i32> %b) { define <8 x i32> @shuffle_v8i32_09ab1def(<8 x i32> %a, <8 x i32> %b) { ; AVX1-LABEL: shuffle_v8i32_09ab1def: ; AVX1: # BB#0: -; AVX1-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX1-NEXT: retq @@ -2050,7 +2050,7 @@ define <8 x i32> @shuffle_v8i32_z0U2zUz6(<8 x i32> %a) { ; AVX1: # BB#0: ; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1 ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] -; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5] +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5] ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v8i32_z0U2zUz6: @@ -2066,7 +2066,7 @@ define <8 x i32> @shuffle_v8i32_1U3z5zUU(<8 x i32> %a) { ; AVX1: # BB#0: ; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1 ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] -; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v8i32_1U3z5zUU: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll index 3ab7d0be530..fea24ab4545 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll @@ -67,7 +67,7 @@ define <4 x float> @combine_vpermilvar_4f32_movsldup(<4 x float> %a0) { define <4 x float> @combine_vpermilvar_4f32_unpckh(<4 x float> %a0) { ; ALL-LABEL: combine_vpermilvar_4f32_unpckh: ; ALL: # BB#0: -; ALL-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] ; ALL-NEXT: retq %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 2, i32 2, i32 3, i32 3>) ret <4 x float> %1 @@ -76,7 +76,7 @@ define <4 x float> @combine_vpermilvar_4f32_unpckh(<4 x float> %a0) { define <4 x float> @combine_vpermilvar_4f32_unpckl(<4 x float> %a0) { ; ALL-LABEL: combine_vpermilvar_4f32_unpckl: ; ALL: # BB#0: -; ALL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0,0,1,1] +; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1] ; ALL-NEXT: retq %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 0, i32 0, i32 1, i32 1>) ret <4 x float> %1 @@ -167,7 +167,7 @@ define <4 x double> @combine_vpermilvar_4f64_movddup(<4 x double> %a0) { define <4 x float> @combine_vpermilvar_4f32_4stage(<4 x float> %a0) { ; ALL-LABEL: combine_vpermilvar_4f32_4stage: ; ALL: # BB#0: -; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,0,1,2,3,12,13,14,15,4,5,6,7] +; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,0,3,1] ; ALL-NEXT: retq %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>) %2 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %1, <4 x i32> <i32 2, i32 3, i32 0, i32 1>) @@ -177,24 +177,10 @@ define <4 x float> @combine_vpermilvar_4f32_4stage(<4 x float> %a0) { } define <8 x 
float> @combine_vpermilvar_8f32_4stage(<8 x float> %a0) { -; AVX1-LABEL: combine_vpermilvar_8f32_4stage: -; AVX1: # BB#0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [3,2,1,0,3,2,1,0] -; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] -; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: combine_vpermilvar_8f32_4stage: -; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,10,11,0,1,2,3,12,13,14,15,4,5,6,7,24,25,26,27,16,17,18,19,28,29,30,31,20,21,22,23] -; AVX2-NEXT: retq -; -; AVX512F-LABEL: combine_vpermilvar_8f32_4stage: -; AVX512F: # BB#0: -; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,10,11,0,1,2,3,12,13,14,15,4,5,6,7,24,25,26,27,16,17,18,19,28,29,30,31,20,21,22,23] -; AVX512F-NEXT: retq +; ALL-LABEL: combine_vpermilvar_8f32_4stage: +; ALL: # BB#0: +; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5] +; ALL-NEXT: retq %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>) %2 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %1, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>) %3 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %2, <8 x i32> <i32 0, i32 2, i32 1, i32 3, i32 0, i32 2, i32 1, i32 3>) diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll index 1f8531a91b4..b6e66c980e9 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll @@ -51,7 +51,7 @@ define <4 x i64> @combine_permq_pshufb(<4 x i64> %a0) { ; CHECK-LABEL: combine_permq_pshufb: ; CHECK: # BB#0: ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0] -; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,24,25,26,27,28,29,30,31,16,17,18,19,20,21,22,23] +; CHECK-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; CHECK-NEXT: retq %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> %2 = bitcast <4 x i64> %1 to <32 x i8> diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll index 658aab594f4..f89294c3013 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll @@ -212,6 +212,43 @@ define <16 x float> @combine_vpermt2var_16f32_vmovsldup_mask_load(<16 x float> * ret <16 x float> %res0 } +define <16 x float> @combine_vpermt2var_16f32_vpermilps(<16 x float> %x0, <16 x float> %x1) { +; CHECK-LABEL: combine_vpermt2var_16f32_vpermilps: +; CHECK: # BB#0: +; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; CHECK-NEXT: retq + %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>, <16 x float> %x0, <16 x float> %x1, i16 -1) + ret <16 x float> %res0 +} +define <16 x float> @combine_vpermt2var_16f32_vpermilps_load(<16 x float> *%p0, <16 x float> %x1) { +; CHECK-LABEL: combine_vpermt2var_16f32_vpermilps_load: +; CHECK: # BB#0: +; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; CHECK-NEXT: retq + %x0 = load <16 x float>, <16 x float> *%p0 + %res0 = call <16 x float> 
@llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>, <16 x float> %x0, <16 x float> %x1, i16 -1) + ret <16 x float> %res0 +} +define <16 x float> @combine_vpermt2var_16f32_vpermilps_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) { +; CHECK-LABEL: combine_vpermt2var_16f32_vpermilps_mask: +; CHECK: # BB#0: +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; CHECK-NEXT: retq + %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>, <16 x float> %x0, <16 x float> %x1, i16 %m) + ret <16 x float> %res0 +} +define <16 x float> @combine_vpermt2var_16f32_vpermilps_mask_load(<16 x float> *%p0, <16 x float> %x1, i16 %m) { +; CHECK-LABEL: combine_vpermt2var_16f32_vpermilps_mask_load: +; CHECK: # BB#0: +; CHECK-NEXT: kmovw %esi, %k1 +; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; CHECK-NEXT: retq + %x0 = load <16 x float>, <16 x float> *%p0 + %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>, <16 x float> %x0, <16 x float> %x1, i16 %m) + ret <16 x float> %res0 +} + define <16 x i32> @combine_vpermt2var_16i32_identity(<16 x i32> %x0, <16 x i32> %x1) { ; CHECK-LABEL: combine_vpermt2var_16i32_identity: ; CHECK: # BB#0: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll index e893d9f8262..4667cb45768 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll @@ -94,13 +94,12 @@ define <4 x float> @combine_pshufb_movsldup(<4 x float> %a0) { define <16 x i8> @combine_pshufb_palignr(<16 x i8> %a0, <16 x i8> %a1) { ; SSE-LABEL: combine_pshufb_palignr: ; SSE: # BB#0: -; SSE-NEXT: pshufb {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15,8,9,10,11,12,13,14,15] -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE-NEXT: retq ; ; AVX-LABEL: combine_pshufb_palignr: ; AVX: # BB#0: -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15,8,9,10,11,12,13,14,15] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; AVX-NEXT: retq %1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23> %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>) diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll index aa7363669a0..266a3658eda 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll @@ -2440,7 +2440,7 @@ define <4 x float> @combine_undef_input_test9(<4 x float> %a) { ; ; AVX-LABEL: combine_undef_input_test9: ; AVX: # BB#0: -; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1] +; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,1] ; AVX-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5> %2 = shufflevector <4 x float> %1, <4 x float> 
%a, <4 x i32> <i32 6, i32 7, i32 0, i32 1> @@ -2631,7 +2631,7 @@ define <4 x float> @combine_undef_input_test19(<4 x float> %a) { ; ; AVX-LABEL: combine_undef_input_test19: ; AVX: # BB#0: -; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1] +; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,1] ; AVX-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5> %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5> |