-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp              | 36
-rw-r--r--  llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll   | 22
-rw-r--r--  llvm/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll | 20
-rw-r--r--  llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll      | 10
-rw-r--r--  llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll       | 48
5 files changed, 71 insertions, 65 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 4f4f8c9b47b..3351f1321c8 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -12606,8 +12606,7 @@ static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
                                         {0, 1, 2, 3, 0, 1, 2, 3});
   if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask,
                                         {0, 1, 2, 3, 8, 9, 10, 11})) {
-    MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
-                                 VT.getVectorNumElements() / 2);
+    MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
     SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
                               DAG.getIntPtrConstant(0, DL));
     SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
@@ -12616,8 +12615,39 @@ static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
   }
 
-  // Try to lower to to vshuf64x2/vshuf32x4.
   assert(WidenedMask.size() == 4);
+
+  // See if this is an insertion of the lower 128-bits of V2 into V1.
+  bool IsInsert = true;
+  int V2Index = -1;
+  for (int i = 0; i < 4; ++i) {
+    assert(WidenedMask[i] >= -1);
+    if (WidenedMask[i] < 0)
+      continue;
+
+    // Make sure all V1 subvectors are in place.
+    if (WidenedMask[i] < 4) {
+      if (WidenedMask[i] != i) {
+        IsInsert = false;
+        break;
+      }
+    } else {
+      // Make sure we only have a single V2 index and it's the lowest 128 bits.
+      if (V2Index >= 0 || WidenedMask[i] != 4) {
+        IsInsert = false;
+        break;
+      }
+      V2Index = i;
+    }
+  }
+  if (IsInsert && V2Index >= 0) {
+    MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
+    SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
+                                 DAG.getIntPtrConstant(0, DL));
+    return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
+  }
+
+  // Try to lower to vshuf64x2/vshuf32x4.
   SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
   unsigned PermMask = 0;
   // Insure elements came from the same Op.
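Editor's note: to make the new widened-mask check easy to follow, here is a minimal standalone sketch of the same test, lifted out of SelectionDAG so it compiles and runs on its own. The function name matchLowLaneInsertion and the driver below are illustrative only, not part of the patch; just the mask-walking logic mirrors the added code.

#include <cassert>
#include <cstdio>

// Returns the 128-bit lane position at which the low lane of V2 is
// inserted into V1, or -1 if the mask is not such an insertion.
// Mask entries: -1 = undef, 0..3 = lanes of V1, 4..7 = lanes of V2.
static int matchLowLaneInsertion(const int Mask[4]) {
  int V2Index = -1;
  for (int i = 0; i < 4; ++i) {
    assert(Mask[i] >= -1 && Mask[i] < 8);
    if (Mask[i] < 0)
      continue; // Undef lanes match anything.
    if (Mask[i] < 4) {
      if (Mask[i] != i)
        return -1; // A V1 lane moved; not a simple insertion.
    } else {
      // Exactly one V2 lane is allowed, and it must be V2's lowest lane.
      if (V2Index >= 0 || Mask[i] != 4)
        return -1;
      V2Index = i;
    }
  }
  return V2Index;
}

int main() {
  int M0[4] = {0, 1, 4, 3}; // <0,1,2,3,8,9,6,7> widened: insert at lane 2.
  int M1[4] = {4, 1, 2, 3}; // <8,9,2,3,...> widened: insert at lane 0.
  int M2[4] = {0, 5, 2, 3}; // Uses V2 lane 1, not the lowest: no match.
  std::printf("%d %d %d\n", matchLowLaneInsertion(M0),
              matchLowLaneInsertion(M1), matchLowLaneInsertion(M2));
  return 0; // Prints: 2 0 -1
}

The invariants match the patch: every defined V1 lane must sit in its identity position, and at most one lane may come from V2, which must be V2's lowest 128 bits — exactly the pattern vinsert{f,i}{32x4,64x2} can encode.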
diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
index ad5656bbdc8..3c649e18bc3 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
@@ -2977,13 +2977,12 @@ define <16 x float>@test_int_x86_avx512_mask_insertf32x4_512(<16 x float> %x0, <
 ; CHECK-LABEL: test_int_x86_avx512_mask_insertf32x4_512:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    ## kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
-; CHECK-NEXT:    vmovapd {{.*#+}} zmm3 = [0,1,8,9,4,5,6,7]
-; CHECK-NEXT:    vpermi2pd %zmm1, %zmm0, %zmm3
+; CHECK-NEXT:    vinsertf32x4 $1, %xmm1, %zmm0, %zmm3
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vblendmps %zmm3, %zmm2, %zmm0 {%k1}
-; CHECK-NEXT:    vmovaps %zmm3, %zmm1 {%k1} {z}
-; CHECK-NEXT:    vaddps %zmm3, %zmm0, %zmm0
-; CHECK-NEXT:    vaddps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    vinsertf32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT:    vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    vaddps %zmm3, %zmm2, %zmm1
+; CHECK-NEXT:    vaddps %zmm1, %zmm0, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 %x4)
   %res1 = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 -1)
@@ -2999,13 +2998,12 @@ define <16 x i32>@test_int_x86_avx512_mask_inserti32x4_512(<16 x i32> %x0, <4 x
 ; CHECK-LABEL: test_int_x86_avx512_mask_inserti32x4_512:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    ## kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,9,4,5,6,7]
-; CHECK-NEXT:    vpermi2q %zmm1, %zmm0, %zmm3
+; CHECK-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm3
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpblendmd %zmm3, %zmm2, %zmm0 {%k1}
-; CHECK-NEXT:    vmovdqa32 %zmm3, %zmm1 {%k1} {z}
-; CHECK-NEXT:    vpaddd %zmm3, %zmm0, %zmm0
-; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    vpaddd %zmm3, %zmm2, %zmm1
+; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 %x4)
   %res1 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 -1)
diff --git a/llvm/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll
index 9403a69e88e..f4cf22c5ed3 100644
--- a/llvm/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll
@@ -79,14 +79,12 @@ define <8 x double>@test_int_x86_avx512_mask_insertf64x2_512(<8 x double> %x0, <
 ; CHECK-LABEL: test_int_x86_avx512_mask_insertf64x2_512:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    ## kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
-; CHECK-NEXT:    vmovapd {{.*#+}} zmm3 = [0,1,8,9,4,5,6,7]
-; CHECK-NEXT:    vmovapd %zmm0, %zmm4
-; CHECK-NEXT:    vpermt2pd %zmm1, %zmm3, %zmm4
+; CHECK-NEXT:    vinsertf64x2 $1, %xmm1, %zmm0, %zmm3
 ; CHECK-NEXT:    kmovb %edi, %k1
-; CHECK-NEXT:    vblendmpd %zmm4, %zmm2, %zmm2 {%k1}
-; CHECK-NEXT:    vpermt2pd %zmm1, %zmm3, %zmm0 {%k1} {z}
+; CHECK-NEXT:    vinsertf64x2 $1, %xmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT:    vinsertf64x2 $1, %xmm1, %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
-; CHECK-NEXT:    vaddpd %zmm4, %zmm0, %zmm0
+; CHECK-NEXT:    vaddpd %zmm3, %zmm0, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.insertf64x2.512(<8 x double> %x0, <2 x double> %x1, i32 1, <8 x double> %x3, i8 %x4)
   %res1 = call <8 x double> @llvm.x86.avx512.mask.insertf64x2.512(<8 x double> %x0, <2 x double> %x1, i32 1, <8 x double> zeroinitializer, i8 %x4)
@@ -122,14 +120,12 @@ define <8 x i64>@test_int_x86_avx512_mask_inserti64x2_512(<8 x i64> %x0, <2 x i6
 ; CHECK-LABEL: test_int_x86_avx512_mask_inserti64x2_512:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    ## kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,9,4,5,6,7]
-; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm4
-; CHECK-NEXT:    vpermt2q %zmm1, %zmm3, %zmm4
+; CHECK-NEXT:    vinserti64x2 $1, %xmm1, %zmm0, %zmm3
 ; CHECK-NEXT:    kmovb %edi, %k1
-; CHECK-NEXT:    vpblendmq %zmm4, %zmm2, %zmm2 {%k1}
-; CHECK-NEXT:    vpermt2q %zmm1, %zmm3, %zmm0 {%k1} {z}
-; CHECK-NEXT:    vpaddq %zmm4, %zmm0, %zmm0
+; CHECK-NEXT:    vinserti64x2 $1, %xmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT:    vinserti64x2 $1, %xmm1, %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
+; CHECK-NEXT:    vpaddq %zmm0, %zmm3, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x2.512(<8 x i64> %x0, <2 x i64> %x1, i32 1, <8 x i64> %x3, i8 %x4)
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x2.512(<8 x i64> %x0, <2 x i64> %x1, i32 1, <8 x i64> zeroinitializer, i8 %x4)
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
index d11c789dfaf..b951bf1c97e 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
@@ -512,10 +512,9 @@ define <16 x float> @mask_shuffle_v16f32_00_01_02_03_04_05_06_07_16_17_18_19_20_
 define <16 x float> @mask_shuffle_v16f32_00_01_02_03_16_17_18_19_08_09_10_11_12_13_14_15(<16 x float> %a, <16 x float> %b, <16 x float> %passthru, i16 %mask) {
 ; ALL-LABEL: mask_shuffle_v16f32_00_01_02_03_16_17_18_19_08_09_10_11_12_13_14_15:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vmovapd {{.*#+}} zmm3 = [0,1,8,9,4,5,6,7]
-; ALL-NEXT:    vpermi2pd %zmm1, %zmm0, %zmm3
 ; ALL-NEXT:    kmovw %edi, %k1
-; ALL-NEXT:    vblendmps %zmm3, %zmm2, %zmm0 {%k1}
+; ALL-NEXT:    vinsertf32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
+; ALL-NEXT:    vmovaps %zmm2, %zmm0
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %mask.cast = bitcast i16 %mask to <16 x i1>
@@ -539,10 +538,9 @@ define <16 x i32> @mask_shuffle_v16i32_00_01_02_03_04_05_06_07_16_17_18_19_20_21
 define <16 x i32> @mask_shuffle_v16i32_00_01_02_03_16_17_18_19_08_09_10_11_12_13_14_15(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passthru, i16 %mask) {
 ; ALL-LABEL: mask_shuffle_v16i32_00_01_02_03_16_17_18_19_08_09_10_11_12_13_14_15:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,9,4,5,6,7]
-; ALL-NEXT:    vpermi2q %zmm1, %zmm0, %zmm3
 ; ALL-NEXT:    kmovw %edi, %k1
-; ALL-NEXT:    vpblendmd %zmm3, %zmm2, %zmm0 {%k1}
+; ALL-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
+; ALL-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %mask.cast = bitcast i16 %mask to <16 x i1>
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
index 485c197d514..365ff3bf63d 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -2463,14 +2463,12 @@ define <8 x i64> @shuffle_v8i64_01230123(<8 x i64> %a, <8 x i64> %b) {
 define <8 x double> @shuffle_v8f64_89234567(<8 x double> %a, <8 x double> %b) {
 ; AVX512F-LABEL: shuffle_v8f64_89234567:
 ; AVX512F:       # BB#0:
-; AVX512F-NEXT:    vmovapd {{.*#+}} zmm2 = [8,9,2,3,4,5,6,7]
-; AVX512F-NEXT:    vpermt2pd %zmm1, %zmm2, %zmm0
+; AVX512F-NEXT:    vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: shuffle_v8f64_89234567:
 ; AVX512F-32:       # BB#0:
-; AVX512F-32-NEXT:    vmovapd {{.*#+}} zmm2 = [8,0,9,0,2,0,3,0,4,0,5,0,6,0,7,0]
-; AVX512F-32-NEXT:    vpermt2pd %zmm1, %zmm2, %zmm0
+; AVX512F-32-NEXT:    vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
 ; AVX512F-32-NEXT:    retl
   %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   ret <8 x double> %shuffle
@@ -2479,14 +2477,12 @@ define <8 x double> @shuffle_v8f64_89234567(<8 x double> %a, <8 x double> %b) {
 define <8 x double> @shuffle_v8f64_01894567(<8 x double> %a, <8 x double> %b) {
 ; AVX512F-LABEL: shuffle_v8f64_01894567:
 ; AVX512F:       # BB#0:
-; AVX512F-NEXT:    vmovapd {{.*#+}} zmm2 = [0,1,8,9,4,5,6,7]
-; AVX512F-NEXT:    vpermt2pd %zmm1, %zmm2, %zmm0
+; AVX512F-NEXT:    vinsertf32x4 $1, %xmm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: shuffle_v8f64_01894567:
 ; AVX512F-32:       # BB#0:
-; AVX512F-32-NEXT:    vmovapd {{.*#+}} zmm2 = [0,0,1,0,8,0,9,0,4,0,5,0,6,0,7,0]
-; AVX512F-32-NEXT:    vpermt2pd %zmm1, %zmm2, %zmm0
+; AVX512F-32-NEXT:    vinsertf32x4 $1, %xmm1, %zmm0, %zmm0
 ; AVX512F-32-NEXT:    retl
   %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
   ret <8 x double> %shuffle
@@ -2495,14 +2491,12 @@ define <8 x double> @shuffle_v8f64_01894567(<8 x double> %a, <8 x double> %b) {
 define <8 x double> @shuffle_v8f64_01238967(<8 x double> %a, <8 x double> %b) {
 ; AVX512F-LABEL: shuffle_v8f64_01238967:
 ; AVX512F:       # BB#0:
-; AVX512F-NEXT:    vmovapd {{.*#+}} zmm2 = [0,1,2,3,8,9,6,7]
-; AVX512F-NEXT:    vpermt2pd %zmm1, %zmm2, %zmm0
+; AVX512F-NEXT:    vinsertf32x4 $2, %xmm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: shuffle_v8f64_01238967:
 ; AVX512F-32:       # BB#0:
-; AVX512F-32-NEXT:    vmovapd {{.*#+}} zmm2 = [0,0,1,0,2,0,3,0,8,0,9,0,6,0,7,0]
-; AVX512F-32-NEXT:    vpermt2pd %zmm1, %zmm2, %zmm0
+; AVX512F-32-NEXT:    vinsertf32x4 $2, %xmm1, %zmm0, %zmm0
 ; AVX512F-32-NEXT:    retl
   %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
   ret <8 x double> %shuffle
@@ -2511,14 +2505,12 @@ define <8 x double> @shuffle_v8f64_01238967(<8 x double> %a, <8 x double> %b) {
 define <8 x double> @shuffle_v8f64_01234589(<8 x double> %a, <8 x double> %b) {
 ; AVX512F-LABEL: shuffle_v8f64_01234589:
 ; AVX512F:       # BB#0:
-; AVX512F-NEXT:    vmovapd {{.*#+}} zmm2 = [0,1,2,3,4,5,8,9]
-; AVX512F-NEXT:    vpermt2pd %zmm1, %zmm2, %zmm0
+; AVX512F-NEXT:    vinsertf32x4 $3, %xmm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: shuffle_v8f64_01234589:
 ; AVX512F-32:       # BB#0:
-; AVX512F-32-NEXT:    vmovapd {{.*#+}} zmm2 = [0,0,1,0,2,0,3,0,4,0,5,0,8,0,9,0]
-; AVX512F-32-NEXT:    vpermt2pd %zmm1, %zmm2, %zmm0
+; AVX512F-32-NEXT:    vinsertf32x4 $3, %xmm1, %zmm0, %zmm0
 ; AVX512F-32-NEXT:    retl
   %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
   ret <8 x double> %shuffle
@@ -2527,14 +2519,12 @@ define <8 x double> @shuffle_v8f64_01234589(<8 x double> %a, <8 x double> %b) {
 define <8 x i64> @shuffle_v8i64_89234567(<8 x i64> %a, <8 x i64> %b) {
 ; AVX512F-LABEL: shuffle_v8i64_89234567:
 ; AVX512F:       # BB#0:
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [8,9,2,3,4,5,6,7]
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512F-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: shuffle_v8i64_89234567:
 ; AVX512F-32:       # BB#0:
-; AVX512F-32-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [8,0,9,0,2,0,3,0,4,0,5,0,6,0,7,0]
-; AVX512F-32-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512F-32-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
 ; AVX512F-32-NEXT:    retl
   %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   ret <8 x i64> %shuffle
@@ -2543,14 +2533,12 @@ define <8 x i64> @shuffle_v8i64_89234567(<8 x i64> %a, <8 x i64> %b) {
 define <8 x i64> @shuffle_v8i64_01894567(<8 x i64> %a, <8 x i64> %b) {
 ; AVX512F-LABEL: shuffle_v8i64_01894567:
 ; AVX512F:       # BB#0:
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,8,9,4,5,6,7]
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512F-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: shuffle_v8i64_01894567:
 ; AVX512F-32:       # BB#0:
-; AVX512F-32-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,0,1,0,8,0,9,0,4,0,5,0,6,0,7,0]
-; AVX512F-32-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512F-32-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm0
 ; AVX512F-32-NEXT:    retl
   %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
   ret <8 x i64> %shuffle
@@ -2559,14 +2547,12 @@ define <8 x i64> @shuffle_v8i64_01894567(<8 x i64> %a, <8 x i64> %b) {
 define <8 x i64> @shuffle_v8i64_01238967(<8 x i64> %a, <8 x i64> %b) {
 ; AVX512F-LABEL: shuffle_v8i64_01238967:
 ; AVX512F:       # BB#0:
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,8,9,6,7]
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512F-NEXT:    vinserti32x4 $2, %xmm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: shuffle_v8i64_01238967:
 ; AVX512F-32:       # BB#0:
-; AVX512F-32-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,0,1,0,2,0,3,0,8,0,9,0,6,0,7,0]
-; AVX512F-32-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512F-32-NEXT:    vinserti32x4 $2, %xmm1, %zmm0, %zmm0
 ; AVX512F-32-NEXT:    retl
   %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
   ret <8 x i64> %shuffle
@@ -2575,14 +2561,12 @@ define <8 x i64> @shuffle_v8i64_01238967(<8 x i64> %a, <8 x i64> %b) {
 define <8 x i64> @shuffle_v8i64_01234589(<8 x i64> %a, <8 x i64> %b) {
 ; AVX512F-LABEL: shuffle_v8i64_01234589:
 ; AVX512F:       # BB#0:
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,8,9]
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512F-NEXT:    vinserti32x4 $3, %xmm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: shuffle_v8i64_01234589:
 ; AVX512F-32:       # BB#0:
-; AVX512F-32-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,0,1,0,2,0,3,0,4,0,5,0,8,0,9,0]
-; AVX512F-32-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512F-32-NEXT:    vinserti32x4 $3, %xmm1, %zmm0, %zmm0
 ; AVX512F-32-NEXT:    retl
   %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
   ret <8 x i64> %shuffle
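Editor's note, a worked example tracing one of the tests above through the new code path: shuffle_v8i64_01238967 has mask <0,1,2,3,8,9,6,7>, which widens over 128-bit lanes to <0,1,4,3>. Every V1 lane sits in its identity position and the single V2 lane is V2's lowest, at position 2, so the lowering calls insert128BitVector with element offset 2 * 2 = 4 and the result prints as vinserti32x4 $2, %xmm1, %zmm0, %zmm0, exactly as the updated CHECK lines expect.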