-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp        | 35
-rw-r--r--  llvm/lib/Target/X86/X86InstrAVX512.td          | 47
-rw-r--r--  llvm/test/CodeGen/X86/avx512-vbroadcasti256.ll | 30
-rw-r--r--  llvm/test/CodeGen/X86/subvector-broadcast.ll   | 60
4 files changed, 99 insertions, 73 deletions
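In brief, this patch extends the subvector-load fold in LowerINSERT_SUBVECTOR from 128-bit halves of 256-bit vectors to 256-bit halves of 512-bit vectors, so a splat of a loaded 256-bit value into a zmm register becomes a single vbroadcastf64x4/vbroadcasti64x4 (or vbroadcast{f,i}32x8 with DQI) instead of a separate load plus vinsert. A minimal LLVM IR reproducer, distilled from the tests updated below (the function name is hypothetical; the shuffle mask is the splat-both-halves shape the fold looks for):

; Expected with this patch on an AVX-512 target: a single vbroadcasti64x4
; from memory, replacing the old vmovdqa64 + vinserti64x4 $1 sequence.
define <8 x i64> @splat_loaded_4i64(<4 x i64>* %p) nounwind {
  %v = load <4 x i64>, <4 x i64>* %p
  %s = shufflevector <4 x i64> %v, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i64> %s
}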
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 2bec8ea23f4..4180f217cb9 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -12982,17 +12982,32 @@ static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
   MVT OpVT = Op.getSimpleValueType();
   MVT SubVecVT = SubVec.getSimpleValueType();
 
-  // Fold two 16-byte subvector loads into one 32-byte load:
-  // (insert_subvector (insert_subvector undef, (load addr), 0),
-  //                   (load addr + 16), Elts/2)
+  if (OpVT.getVectorElementType() == MVT::i1)
+    return insert1BitVector(Op, DAG, Subtarget);
+
+  assert((OpVT.is256BitVector() || OpVT.is512BitVector()) &&
+         "Can only insert into 256-bit or 512-bit vectors");
+
+  // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte
+  // load:
+  // (insert_subvector (insert_subvector undef, (load16 addr), 0),
+  //                   (load16 addr + 16), Elts/2)
   // --> load32 addr
-  // or a 16-byte broadcast:
-  // (insert_subvector (insert_subvector undef, (load addr), 0),
-  //                   (load addr), Elts/2)
+  // or:
+  // (insert_subvector (insert_subvector undef, (load32 addr), 0),
+  //                   (load32 addr + 32), Elts/2)
+  // --> load64 addr
+  // or a 16-byte or 32-byte broadcast:
+  // (insert_subvector (insert_subvector undef, (load16 addr), 0),
+  //                   (load16 addr), Elts/2)
   // --> X86SubVBroadcast(load16 addr)
+  // or:
+  // (insert_subvector (insert_subvector undef, (load32 addr), 0),
+  //                   (load32 addr), Elts/2)
+  // --> X86SubVBroadcast(load32 addr)
   if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
       Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
-      OpVT.is256BitVector() && SubVecVT.is128BitVector()) {
+      OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) {
     auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
     if (Idx2 && Idx2->getZExtValue() == 0) {
       SDValue SubVec2 = Vec.getOperand(1);
@@ -13020,12 +13035,6 @@ static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
     }
   }
 
-  if (OpVT.getVectorElementType() == MVT::i1)
-    return insert1BitVector(Op, DAG, Subtarget);
-
-  assert((OpVT.is256BitVector() || OpVT.is512BitVector()) &&
-         "Can only insert into 256-bit or 512-bit vectors");
-
   if (SubVecVT.is128BitVector())
     return insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
 
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 06c87b9884c..144c44a2962 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -1043,6 +1043,28 @@ defm VBROADCASTF64X4 : avx512_subvec_broadcast_rm<0x1b, "vbroadcastf64x4",
                        v8f64_info, v4f64x_info>, VEX_W,
                        EVEX_V512, EVEX_CD8<64, CD8VT4>;
 
+let Predicates = [HasAVX512] in {
+def : Pat<(v32i16 (X86SubVBroadcast (bc_v16i16 (loadv4i64 addr:$src)))),
+          (VBROADCASTI64X4rm addr:$src)>;
+def : Pat<(v64i8 (X86SubVBroadcast (bc_v32i8 (loadv4i64 addr:$src)))),
+          (VBROADCASTI64X4rm addr:$src)>;
+
+// Provide fallback in case the load node that is used in the patterns above
+// is used by additional users, which prevents the pattern selection.
+def : Pat<(v16f32 (X86SubVBroadcast (v8f32 VR256X:$src))),
+          (VINSERTF64x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+                           (v8f32 VR256X:$src), 1)>;
+def : Pat<(v16i32 (X86SubVBroadcast (v8i32 VR256X:$src))),
+          (VINSERTI64x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+                           (v8i32 VR256X:$src), 1)>;
+def : Pat<(v32i16 (X86SubVBroadcast (v16i16 VR256X:$src))),
+          (VINSERTI64x4Zrr (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+                           (v16i16 VR256X:$src), 1)>;
+def : Pat<(v64i8 (X86SubVBroadcast (v32i8 VR256X:$src))),
+          (VINSERTI64x4Zrr (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+                           (v32i8 VR256X:$src), 1)>;
+}
+
 let Predicates = [HasVLX] in {
 defm VBROADCASTI32X4Z256 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti32x4",
                            v8i32x_info, v4i32x_info>,
@@ -1106,6 +1128,22 @@ def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128X:$src))),
                             (v2i64 VR128X:$src), 1)>;
 }
 
+let Predicates = [HasAVX512, NoDQI] in {
+def : Pat<(v16f32 (X86SubVBroadcast (loadv8f32 addr:$src))),
+          (VBROADCASTF64X4rm addr:$src)>;
+def : Pat<(v16i32 (X86SubVBroadcast (bc_v8i32 (loadv4i64 addr:$src)))),
+          (VBROADCASTI64X4rm addr:$src)>;
+
+// Provide fallback in case the load node that is used in the patterns above
+// is used by additional users, which prevents the pattern selection.
+def : Pat<(v16f32 (X86SubVBroadcast (v8f32 VR256X:$src))),
+          (VINSERTF64x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+                           (v8f32 VR256X:$src), 1)>;
+def : Pat<(v16i32 (X86SubVBroadcast (v8i32 VR256X:$src))),
+          (VINSERTI64x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+                           (v8i32 VR256X:$src), 1)>;
+}
+
 let Predicates = [HasDQI] in {
 defm VBROADCASTI64X2 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti64x2",
                        v8i64_info, v2i64x_info>, VEX_W,
@@ -1119,6 +1157,15 @@ defm VBROADCASTF64X2 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf64x2",
 defm VBROADCASTF32X8 : avx512_subvec_broadcast_rm<0x1b, "vbroadcastf32x8",
                        v16f32_info, v8f32x_info>,
                        EVEX_V512, EVEX_CD8<32, CD8VT8>;
+
+// Provide fallback in case the load node that is used in the patterns above
+// is used by additional users, which prevents the pattern selection.
+def : Pat<(v16f32 (X86SubVBroadcast (v8f32 VR256X:$src))),
+          (VINSERTF32x8Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+                           (v8f32 VR256X:$src), 1)>;
+def : Pat<(v16i32 (X86SubVBroadcast (v8i32 VR256X:$src))),
+          (VINSERTI32x8Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+                           (v8i32 VR256X:$src), 1)>;
 }
 
 multiclass avx512_common_broadcast_32x2<bits<8> opc, string OpcodeStr,
diff --git a/llvm/test/CodeGen/X86/avx512-vbroadcasti256.ll b/llvm/test/CodeGen/X86/avx512-vbroadcasti256.ll
index c35c95b6fcf..01d6962a6b7 100644
--- a/llvm/test/CodeGen/X86/avx512-vbroadcasti256.ll
+++ b/llvm/test/CodeGen/X86/avx512-vbroadcasti256.ll
@@ -6,8 +6,7 @@
 define <8 x double> @test_broadcast_4f64_8f64(<4 x double> *%p) nounwind {
 ; X64-AVX512-LABEL: test_broadcast_4f64_8f64:
 ; X64-AVX512:       ## BB#0:
-; X64-AVX512-NEXT:    vmovapd (%rdi), %ymm0
-; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X64-AVX512-NEXT:    vaddpd {{.*}}(%rip), %zmm0, %zmm0
 ; X64-AVX512-NEXT:    retq
  %1 = load <4 x double>, <4 x double> *%p
@@ -19,8 +18,7 @@
 define <8 x i64> @test_broadcast_4i64_8i64(<4 x i64> *%p) nounwind {
 ; X64-AVX512-LABEL: test_broadcast_4i64_8i64:
 ; X64-AVX512:       ## BB#0:
-; X64-AVX512-NEXT:    vmovdqa64 (%rdi), %ymm0
-; X64-AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X64-AVX512-NEXT:    vpaddq {{.*}}(%rip), %zmm0, %zmm0
 ; X64-AVX512-NEXT:    retq
  %1 = load <4 x i64>, <4 x i64> *%p
@@ -32,22 +30,19 @@
 define <16 x float> @test_broadcast_8f32_16f32(<8 x float> *%p) nounwind {
 ; X64-AVX512VL-LABEL: test_broadcast_8f32_16f32:
 ; X64-AVX512VL:       ## BB#0:
-; X64-AVX512VL-NEXT:    vmovapd (%rdi), %ymm0
-; X64-AVX512VL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512VL-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X64-AVX512VL-NEXT:    vaddps {{.*}}(%rip), %zmm0, %zmm0
 ; X64-AVX512VL-NEXT:    retq
 ;
 ; X64-AVX512BWVL-LABEL: test_broadcast_8f32_16f32:
 ; X64-AVX512BWVL:       ## BB#0:
-; X64-AVX512BWVL-NEXT:    vmovapd (%rdi), %ymm0
-; X64-AVX512BWVL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512BWVL-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X64-AVX512BWVL-NEXT:    vaddps {{.*}}(%rip), %zmm0, %zmm0
 ; X64-AVX512BWVL-NEXT:    retq
 ;
 ; X64-AVX512DQVL-LABEL: test_broadcast_8f32_16f32:
 ; X64-AVX512DQVL:       ## BB#0:
-; X64-AVX512DQVL-NEXT:    vmovaps (%rdi), %ymm0
-; X64-AVX512DQVL-NEXT:    vinsertf32x8 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512DQVL-NEXT:    vbroadcastf32x8 {{.*#+}} zmm0 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
 ; X64-AVX512DQVL-NEXT:    vaddps {{.*}}(%rip), %zmm0, %zmm0
 ; X64-AVX512DQVL-NEXT:    retq
  %1 = load <8 x float>, <8 x float> *%p
@@ -59,22 +54,19 @@
 define <16 x i32> @test_broadcast_8i32_16i32(<8 x i32> *%p) nounwind {
 ; X64-AVX512VL-LABEL: test_broadcast_8i32_16i32:
 ; X64-AVX512VL:       ## BB#0:
-; X64-AVX512VL-NEXT:    vmovdqa32 (%rdi), %ymm0
-; X64-AVX512VL-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512VL-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X64-AVX512VL-NEXT:    vpaddd {{.*}}(%rip), %zmm0, %zmm0
 ; X64-AVX512VL-NEXT:    retq
 ;
 ; X64-AVX512BWVL-LABEL: test_broadcast_8i32_16i32:
 ; X64-AVX512BWVL:       ## BB#0:
-; X64-AVX512BWVL-NEXT:    vmovdqa32 (%rdi), %ymm0
-; X64-AVX512BWVL-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512BWVL-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X64-AVX512BWVL-NEXT:    vpaddd {{.*}}(%rip), %zmm0, %zmm0
 ; X64-AVX512BWVL-NEXT:    retq
 ;
 ; X64-AVX512DQVL-LABEL: test_broadcast_8i32_16i32:
 ; X64-AVX512DQVL:       ## BB#0:
-; X64-AVX512DQVL-NEXT:    vmovdqa32 (%rdi), %ymm0
-; X64-AVX512DQVL-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512DQVL-NEXT:    vbroadcasti32x8 {{.*#+}} zmm0 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
 ; X64-AVX512DQVL-NEXT:    vpaddd {{.*}}(%rip), %zmm0, %zmm0
 ; X64-AVX512DQVL-NEXT:    retq
  %1 = load <8 x i32>, <8 x i32> *%p
@@ -93,8 +85,7 @@ define <32 x i16> @test_broadcast_16i16_32i16(<16 x i16> *%p) nounwind {
 ;
 ; X64-AVX512BWVL-LABEL: test_broadcast_16i16_32i16:
 ; X64-AVX512BWVL:       ## BB#0:
-; X64-AVX512BWVL-NEXT:    vmovdqu16 (%rdi), %ymm0
-; X64-AVX512BWVL-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512BWVL-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X64-AVX512BWVL-NEXT:    vpaddw {{.*}}(%rip), %zmm0, %zmm0
 ; X64-AVX512BWVL-NEXT:    retq
 ;
@@ -120,8 +111,7 @@ define <64 x i8> @test_broadcast_32i8_64i8(<32 x i8> *%p) nounwind {
 ;
 ; X64-AVX512BWVL-LABEL: test_broadcast_32i8_64i8:
 ; X64-AVX512BWVL:       ## BB#0:
-; X64-AVX512BWVL-NEXT:    vmovdqu8 (%rdi), %ymm0
-; X64-AVX512BWVL-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512BWVL-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X64-AVX512BWVL-NEXT:    vpaddb {{.*}}(%rip), %zmm0, %zmm0
 ; X64-AVX512BWVL-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/subvector-broadcast.ll b/llvm/test/CodeGen/X86/subvector-broadcast.ll
index 6059b4def65..bed52ed7f42 100644
--- a/llvm/test/CodeGen/X86/subvector-broadcast.ll
+++ b/llvm/test/CodeGen/X86/subvector-broadcast.ll
@@ -131,8 +131,7 @@ define <8 x double> @test_broadcast_4f64_8f64(<4 x double> *%p) nounwind {
 ; X32-AVX512-LABEL: test_broadcast_4f64_8f64:
 ; X32-AVX512:       ## BB#0:
 ; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512-NEXT:    vmovapd (%eax), %ymm0
-; X32-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X32-AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X32-AVX512-NEXT:    retl
 ;
 ; X64-AVX-LABEL: test_broadcast_4f64_8f64:
@@ -143,8 +142,7 @@ define <8 x double> @test_broadcast_4f64_8f64(<4 x double> *%p) nounwind {
 ;
 ; X64-AVX512-LABEL: test_broadcast_4f64_8f64:
 ; X64-AVX512:       ## BB#0:
-; X64-AVX512-NEXT:    vmovapd (%rdi), %ymm0
-; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X64-AVX512-NEXT:    retq
  %1 = load <4 x double>, <4 x double> *%p
  %2 = shufflevector <4 x double> %1, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
@@ -281,8 +279,7 @@ define <8 x i64> @test_broadcast_4i64_8i64(<4 x i64> *%p) nounwind {
 ; X32-AVX512-LABEL: test_broadcast_4i64_8i64:
 ; X32-AVX512:       ## BB#0:
 ; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512-NEXT:    vmovdqa64 (%eax), %ymm0
-; X32-AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; X32-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X32-AVX512-NEXT:    retl
 ;
 ; X64-AVX-LABEL: test_broadcast_4i64_8i64:
@@ -293,8 +290,7 @@ define <8 x i64> @test_broadcast_4i64_8i64(<4 x i64> *%p) nounwind {
 ;
 ; X64-AVX512-LABEL: test_broadcast_4i64_8i64:
 ; X64-AVX512:       ## BB#0:
-; X64-AVX512-NEXT:    vmovdqa64 (%rdi), %ymm0
-; X64-AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X64-AVX512-NEXT:    retq
  %1 = load <4 x i64>, <4 x i64> *%p
  %2 = shufflevector <4 x i64> %1, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
@@ -396,22 +392,19 @@ define <16 x float> @test_broadcast_8f32_16f32(<8 x float> *%p) nounwind {
 ; X32-AVX512F-LABEL: test_broadcast_8f32_16f32:
 ; X32-AVX512F:       ## BB#0:
 ; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT:    vmovapd (%eax), %ymm0
-; X32-AVX512F-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X32-AVX512F-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X32-AVX512F-NEXT:    retl
 ;
 ; X32-AVX512BW-LABEL: test_broadcast_8f32_16f32:
 ; X32-AVX512BW:       ## BB#0:
 ; X32-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512BW-NEXT:    vmovapd (%eax), %ymm0
-; X32-AVX512BW-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X32-AVX512BW-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X32-AVX512BW-NEXT:    retl
 ;
 ; X32-AVX512DQ-LABEL: test_broadcast_8f32_16f32:
 ; X32-AVX512DQ:       ## BB#0:
 ; X32-AVX512DQ-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512DQ-NEXT:    vmovaps (%eax), %ymm0
-; X32-AVX512DQ-NEXT:    vinsertf32x8 $1, %ymm0, %zmm0, %zmm0
+; X32-AVX512DQ-NEXT:    vbroadcastf32x8 {{.*#+}} zmm0 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
 ; X32-AVX512DQ-NEXT:    retl
 ;
 ; X64-AVX-LABEL: test_broadcast_8f32_16f32:
@@ -422,20 +415,17 @@ define <16 x float> @test_broadcast_8f32_16f32(<8 x float> *%p) nounwind {
 ;
 ; X64-AVX512F-LABEL: test_broadcast_8f32_16f32:
 ; X64-AVX512F:       ## BB#0:
-; X64-AVX512F-NEXT:    vmovapd (%rdi), %ymm0
-; X64-AVX512F-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512F-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X64-AVX512F-NEXT:    retq
 ;
 ; X64-AVX512BW-LABEL: test_broadcast_8f32_16f32:
 ; X64-AVX512BW:       ## BB#0:
-; X64-AVX512BW-NEXT:    vmovapd (%rdi), %ymm0
-; X64-AVX512BW-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512BW-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X64-AVX512BW-NEXT:    retq
 ;
 ; X64-AVX512DQ-LABEL: test_broadcast_8f32_16f32:
 ; X64-AVX512DQ:       ## BB#0:
-; X64-AVX512DQ-NEXT:    vmovaps (%rdi), %ymm0
-; X64-AVX512DQ-NEXT:    vinsertf32x8 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512DQ-NEXT:    vbroadcastf32x8 {{.*#+}} zmm0 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
 ; X64-AVX512DQ-NEXT:    retq
  %1 = load <8 x float>, <8 x float> *%p
  %2 = shufflevector <8 x float> %1, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -550,22 +540,19 @@ define <16 x i32> @test_broadcast_8i32_16i32(<8 x i32> *%p) nounwind {
 ; X32-AVX512F-LABEL: test_broadcast_8i32_16i32:
 ; X32-AVX512F:       ## BB#0:
 ; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT:    vmovdqa32 (%eax), %ymm0
-; X32-AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; X32-AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X32-AVX512F-NEXT:    retl
 ;
 ; X32-AVX512BW-LABEL: test_broadcast_8i32_16i32:
 ; X32-AVX512BW:       ## BB#0:
 ; X32-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512BW-NEXT:    vmovdqa32 (%eax), %ymm0
-; X32-AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; X32-AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X32-AVX512BW-NEXT:    retl
 ;
 ; X32-AVX512DQ-LABEL: test_broadcast_8i32_16i32:
 ; X32-AVX512DQ:       ## BB#0:
 ; X32-AVX512DQ-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512DQ-NEXT:    vmovdqa32 (%eax), %ymm0
-; X32-AVX512DQ-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm0
+; X32-AVX512DQ-NEXT:    vbroadcasti32x8 {{.*#+}} zmm0 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
 ; X32-AVX512DQ-NEXT:    retl
 ;
 ; X64-AVX-LABEL: test_broadcast_8i32_16i32:
@@ -576,20 +563,17 @@ define <16 x i32> @test_broadcast_8i32_16i32(<8 x i32> *%p) nounwind {
 ;
 ; X64-AVX512F-LABEL: test_broadcast_8i32_16i32:
 ; X64-AVX512F:       ## BB#0:
-; X64-AVX512F-NEXT:    vmovdqa32 (%rdi), %ymm0
-; X64-AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X64-AVX512F-NEXT:    retq
 ;
 ; X64-AVX512BW-LABEL: test_broadcast_8i32_16i32:
 ; X64-AVX512BW:       ## BB#0:
-; X64-AVX512BW-NEXT:    vmovdqa32 (%rdi), %ymm0
-; X64-AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X64-AVX512BW-NEXT:    retq
 ;
 ; X64-AVX512DQ-LABEL: test_broadcast_8i32_16i32:
 ; X64-AVX512DQ:       ## BB#0:
-; X64-AVX512DQ-NEXT:    vmovdqa32 (%rdi), %ymm0
-; X64-AVX512DQ-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512DQ-NEXT:    vbroadcasti32x8 {{.*#+}} zmm0 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
 ; X64-AVX512DQ-NEXT:    retq
  %1 = load <8 x i32>, <8 x i32> *%p
  %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -711,8 +695,7 @@ define <32 x i16> @test_broadcast_16i16_32i16(<16 x i16> *%p) nounwind {
 ; X32-AVX512BW-LABEL: test_broadcast_16i16_32i16:
 ; X32-AVX512BW:       ## BB#0:
 ; X32-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512BW-NEXT:    vmovdqu16 (%eax), %ymm0
-; X32-AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; X32-AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X32-AVX512BW-NEXT:    retl
 ;
 ; X32-AVX512DQ-LABEL: test_broadcast_16i16_32i16:
@@ -736,8 +719,7 @@ define <32 x i16> @test_broadcast_16i16_32i16(<16 x i16> *%p) nounwind {
 ;
 ; X64-AVX512BW-LABEL: test_broadcast_16i16_32i16:
 ; X64-AVX512BW:       ## BB#0:
-; X64-AVX512BW-NEXT:    vmovdqu16 (%rdi), %ymm0
-; X64-AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X64-AVX512BW-NEXT:    retq
 ;
 ; X64-AVX512DQ-LABEL: test_broadcast_16i16_32i16:
@@ -865,8 +847,7 @@ define <64 x i8> @test_broadcast_32i8_64i8(<32 x i8> *%p) nounwind {
 ; X32-AVX512BW-LABEL: test_broadcast_32i8_64i8:
 ; X32-AVX512BW:       ## BB#0:
 ; X32-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512BW-NEXT:    vmovdqu8 (%eax), %ymm0
-; X32-AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; X32-AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X32-AVX512BW-NEXT:    retl
 ;
 ; X32-AVX512DQ-LABEL: test_broadcast_32i8_64i8:
@@ -890,8 +871,7 @@ define <64 x i8> @test_broadcast_32i8_64i8(<32 x i8> *%p) nounwind {
 ;
 ; X64-AVX512BW-LABEL: test_broadcast_32i8_64i8:
 ; X64-AVX512BW:       ## BB#0:
-; X64-AVX512BW-NEXT:    vmovdqu8 (%rdi), %ymm0
-; X64-AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X64-AVX512BW-NEXT:    retq
 ;
 ; X64-AVX512DQ-LABEL: test_broadcast_32i8_64i8:
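As the "provide fallback" comments in the X86InstrAVX512.td hunks explain, the memory-form broadcast patterns only fire when the broadcast is the load's sole user. A sketch of IR (hypothetical function name) where a second user of the loaded value should steer instruction selection to the VINSERTI64x4Zrr-based register fallback instead; whether the load actually stays unfolded is up to ISel, which is precisely why the fallback patterns exist:

define <16 x i32> @splat_loaded_8i32_extra_use(<8 x i32>* %p, <8 x i32>* %q) nounwind {
  %v = load <8 x i32>, <8 x i32>* %p
  ; Second user of %v: the load can no longer be absorbed into a
  ; memory-operand vbroadcasti64x4, so the register form applies.
  store <8 x i32> %v, <8 x i32>* %q
  %s = shufflevector <8 x i32> %v, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i32> %s
}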