| author | Sanjay Patel <spatel@rotateright.com> | 2018-08-25 14:56:05 +0000 |
|---|---|---|
| committer | Sanjay Patel <spatel@rotateright.com> | 2018-08-25 14:56:05 +0000 |
| commit | 8a84c747d2de2e99e035d8e072a00795b406ca6e (patch) | |
| tree | cf484c68d117ba1ba4c8e4ab63a737b067dce94b /llvm/test/CodeGen | |
| parent | 904343f879b34b44185f60d277ab568342d62bf8 (diff) | |
[x86] try harder to use broadcast to load a scalar into vector reg
This is a preliminary step for a preliminary step for D50992.
I noticed that x86 often misses chances to load a scalar directly
into a vector register.
So this patch just allows more of those cases to match a
broadcast op in lowerBuildVectorAsBroadcast(). The old code comment
said it doesn't make sense to use a broadcast when we're loading a
single element and everything else is undef, but I think that's the
best case in the improved tests in insert-loaded-scalar.ll: we avoid
a scalar-to-vector-register move and/or less efficient shuffling.
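As a concrete example, here is one of the improved tests from
insert-loaded-scalar.ll in this patch, annotated with the before/after
instruction sequences shown in the test diffs for an AVX2 target:

```llvm
; A scalar load inserted into lane 0 of an otherwise-undef vector
; (taken from insert-loaded-scalar.ll). Compile with, for example:
;   llc -mtriple=x86_64-- -mattr=+avx2 example.ll
define <16 x i8> @load8_ins_elt0_v16i8(i8* %p) nounwind {
  %x = load i8, i8* %p
  %ins = insertelement <16 x i8> undef, i8 %x, i32 0
  ret <16 x i8> %ins
}
; With this patch, AVX2 selects a single load-folded broadcast:
;   vpbroadcastb (%rdi), %xmm0
; instead of the previous scalar load + scalar-to-vector move:
;   movzbl (%rdi), %eax
;   vmovd  %eax, %xmm0
```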
Note that there are some existing types that were already producing
a broadcast, but that happens semi-accidentally; i.e., it's not
happening as part of lowerBuildVectorAsBroadcast(). The build vector
gets expanded into load + shuffle, and then shuffle lowering produces
the broadcast.
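For comparison, here is a rough sketch (not a test from this patch; the
function name is made up for illustration) of that semi-accidental path:
the splat is written as a shufflevector, so shuffle lowering, rather than
lowerBuildVectorAsBroadcast(), is what ends up picking the broadcast.

```llvm
; Hypothetical example: a splat of a loaded float expressed as
; insertelement + shufflevector. Shuffle lowering can already turn
; this into a load-folded vbroadcastss on an AVX target, independent
; of the build-vector change in this patch.
define <4 x float> @splat_loaded_f32(float* %p) nounwind {
  %x = load float, float* %p
  %v = insertelement <4 x float> undef, float %x, i32 0
  %s = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
  ret <4 x float> %s
}
```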
Description of the other test diffs:
1. avx-basic.ll - replacing load+shuffle with a broadcast is a win.
2. sse3-avx-addsub-2.ll - vmovddup vs. vbroadcastss is neutral.
3. sse41.ll - don't care - we convert that intrinsic to generic IR now, so this test is deprecated
4. vector-shuffle-128-v8.ll / vector-shuffle-256-v16.ll - pshufb alternatives with an extra instruction are not obviously bad
Differential Revision: https://reviews.llvm.org/D51125
llvm-svn: 340685
Diffstat (limited to 'llvm/test/CodeGen')
| Mode | File | Lines changed |
|---|---|---|
| -rw-r--r-- | llvm/test/CodeGen/X86/avx-basic.ll | 3 |
| -rw-r--r-- | llvm/test/CodeGen/X86/insert-loaded-scalar.ll | 163 |
| -rw-r--r-- | llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll | 22 |
| -rw-r--r-- | llvm/test/CodeGen/X86/sse41.ll | 10 |
| -rw-r--r-- | llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll | 5 |
| -rw-r--r-- | llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll | 5 |
6 files changed, 101 insertions, 107 deletions
diff --git a/llvm/test/CodeGen/X86/avx-basic.ll b/llvm/test/CodeGen/X86/avx-basic.ll
index d27a641203f..b7c9b692d89 100644
--- a/llvm/test/CodeGen/X86/avx-basic.ll
+++ b/llvm/test/CodeGen/X86/avx-basic.ll
@@ -76,8 +76,7 @@ define <4 x i64> @ISelCrash(<4 x i64> %a) nounwind uwtable readnone ssp {
 define <8 x i32> @VMOVZQI2PQI([0 x float]* nocapture %aFOO) nounwind {
 ; CHECK-LABEL: VMOVZQI2PQI:
 ; CHECK: ## %bb.0:
-; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; CHECK-NEXT: vbroadcastss (%rdi), %ymm0
 ; CHECK-NEXT: retq
 %ptrcast.i33.i = bitcast [0 x float]* %aFOO to i32*
 %val.i34.i = load i32, i32* %ptrcast.i33.i, align 4
diff --git a/llvm/test/CodeGen/X86/insert-loaded-scalar.ll b/llvm/test/CodeGen/X86/insert-loaded-scalar.ll
index ec6b99cf63e..81cb533f442 100644
--- a/llvm/test/CodeGen/X86/insert-loaded-scalar.ll
+++ b/llvm/test/CodeGen/X86/insert-loaded-scalar.ll
@@ -10,11 +10,16 @@ define <16 x i8> @load8_ins_elt0_v16i8(i8* %p) nounwind {
 ; SSE-NEXT: movd %eax, %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: load8_ins_elt0_v16i8:
-; AVX: # %bb.0:
-; AVX-NEXT: movzbl (%rdi), %eax
-; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: load8_ins_elt0_v16i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: movzbl (%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: load8_ins_elt0_v16i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastb (%rdi), %xmm0
+; AVX2-NEXT: retq
 %x = load i8, i8* %p
 %ins = insertelement <16 x i8> undef, i8 %x, i32 0
 ret <16 x i8> %ins
@@ -27,11 +32,16 @@ define <8 x i16> @load16_ins_elt0_v8i16(i16* %p) nounwind {
 ; SSE-NEXT: movd %eax, %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: load16_ins_elt0_v8i16:
-; AVX: # %bb.0:
-; AVX-NEXT: movzwl (%rdi), %eax
-; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: load16_ins_elt0_v8i16:
+; AVX1: # %bb.0:
+; AVX1-NEXT: movzwl (%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: load16_ins_elt0_v8i16:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastw (%rdi), %xmm0
+; AVX2-NEXT: retq
 %x = load i16, i16* %p
 %ins = insertelement <8 x i16> undef, i16 %x, i32 0
 ret <8 x i16> %ins
@@ -105,12 +115,17 @@ define <16 x i8> @load8_ins_eltc_v16i8(i8* %p) nounwind {
 ; SSE-NEXT: pslld $24, %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: load8_ins_eltc_v16i8:
-; AVX: # %bb.0:
-; AVX-NEXT: movzbl (%rdi), %eax
-; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: load8_ins_eltc_v16i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: movzbl (%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: vpslld $24, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: load8_ins_eltc_v16i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastb (%rdi), %xmm0
+; AVX2-NEXT: retq
 %x = load i8, i8* %p
 %ins = insertelement <16 x i8> undef, i8 %x, i32 3
 ret <16 x i8> %ins
@@ -147,17 +162,10 @@ define <4 x i32> @load32_ins_eltc_v4i32(i32* %p) nounwind {
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; SSE-NEXT: retq
 ;
-; AVX1-LABEL: load32_ins_eltc_v4i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load32_ins_eltc_v4i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
-; AVX2-NEXT: retq
+; AVX-LABEL: load32_ins_eltc_v4i32:
+; AVX: # %bb.0:
+; AVX-NEXT: vbroadcastss (%rdi), %xmm0
+; AVX-NEXT: retq
 %x = load i32, i32* %p
 %ins = insertelement <4 x i32> undef, i32 %x, i32 2
 ret <4 x i32> %ins
@@ -223,11 +231,16 @@ define <32 x i8> @load8_ins_elt0_v32i8(i8* %p) nounwind {
 ; SSE-NEXT: movd %eax, %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: load8_ins_elt0_v32i8:
-; AVX: # %bb.0:
-; AVX-NEXT: movzbl (%rdi), %eax
-; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: load8_ins_elt0_v32i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: movzbl (%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: load8_ins_elt0_v32i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastb (%rdi), %ymm0
+; AVX2-NEXT: retq
 %x = load i8, i8* %p
 %ins = insertelement <32 x i8> undef, i8 %x, i32 0
 ret <32 x i8> %ins
@@ -240,11 +253,16 @@ define <16 x i16> @load16_ins_elt0_v16i16(i16* %p) nounwind {
 ; SSE-NEXT: movd %eax, %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: load16_ins_elt0_v16i16:
-; AVX: # %bb.0:
-; AVX-NEXT: movzwl (%rdi), %eax
-; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: load16_ins_elt0_v16i16:
+; AVX1: # %bb.0:
+; AVX1-NEXT: movzwl (%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: load16_ins_elt0_v16i16:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastw (%rdi), %ymm0
+; AVX2-NEXT: retq
 %x = load i16, i16* %p
 %ins = insertelement <16 x i16> undef, i16 %x, i32 0
 ret <16 x i16> %ins
@@ -328,10 +346,7 @@ define <32 x i8> @load8_ins_eltc_v32i8(i8* %p) nounwind {
 ;
 ; AVX2-LABEL: load8_ins_eltc_v32i8:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: movzbl (%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vpsllq $40, %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastb (%rdi), %ymm0
 ; AVX2-NEXT: retq
 %x = load i8, i8* %p
 %ins = insertelement <32 x i8> undef, i8 %x, i32 21
@@ -356,10 +371,7 @@ define <16 x i16> @load16_ins_eltc_v16i16(i16* %p) nounwind {
 ;
 ; AVX2-LABEL: load16_ins_eltc_v16i16:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: movzwl (%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastw (%rdi), %ymm0
 ; AVX2-NEXT: retq
 %x = load i16, i16* %p
 %ins = insertelement <16 x i16> undef, i16 %x, i32 11
@@ -373,18 +385,10 @@ define <8 x i32> @load32_ins_eltc_v8i32(i32* %p) nounwind {
 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,2,0]
 ; SSE-NEXT: retq
 ;
-; AVX1-LABEL: load32_ins_eltc_v8i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,0]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load32_ins_eltc_v8i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vbroadcastss (%rdi), %xmm0
-; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT: retq
+; AVX-LABEL: load32_ins_eltc_v8i32:
+; AVX: # %bb.0:
+; AVX-NEXT: vbroadcastss (%rdi), %ymm0
+; AVX-NEXT: retq
 %x = load i32, i32* %p
 %ins = insertelement <8 x i32> undef, i32 %x, i32 7
 ret <8 x i32> %ins
@@ -397,17 +401,10 @@ define <4 x i64> @load64_ins_eltc_v4i64(i64* %p) nounwind {
 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
 ; SSE-NEXT: retq
 ;
-; AVX1-LABEL: load64_ins_eltc_v4i64:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load64_ins_eltc_v4i64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vbroadcastsd (%rdi), %ymm0
-; AVX2-NEXT: retq
+; AVX-LABEL: load64_ins_eltc_v4i64:
+; AVX: # %bb.0:
+; AVX-NEXT: vbroadcastsd (%rdi), %ymm0
+; AVX-NEXT: retq
 %x = load i64, i64* %p
 %ins = insertelement <4 x i64> undef, i64 %x, i32 3
 ret <4 x i64> %ins
@@ -420,18 +417,10 @@ define <8 x float> @load32_ins_eltc_v8f32(float* %p) nounwind {
 ; SSE-NEXT: movsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE-NEXT: retq
 ;
-; AVX1-LABEL: load32_ins_eltc_v8f32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX1-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load32_ins_eltc_v8f32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vbroadcastss (%rdi), %xmm0
-; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT: retq
+; AVX-LABEL: load32_ins_eltc_v8f32:
+; AVX: # %bb.0:
+; AVX-NEXT: vbroadcastss (%rdi), %ymm0
+; AVX-NEXT: retq
 %x = load float, float* %p
 %ins = insertelement <8 x float> undef, float %x, i32 5
 ret <8 x float> %ins
@@ -443,16 +432,10 @@ define <4 x double> @load64_ins_eltc_v4f64(double* %p) nounwind {
 ; SSE-NEXT: movddup {{.*#+}} xmm1 = mem[0,0]
 ; SSE-NEXT: retq
 ;
-; AVX1-LABEL: load64_ins_eltc_v4f64:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load64_ins_eltc_v4f64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vbroadcastsd (%rdi), %ymm0
-; AVX2-NEXT: retq
+; AVX-LABEL: load64_ins_eltc_v4f64:
+; AVX: # %bb.0:
+; AVX-NEXT: vbroadcastsd (%rdi), %ymm0
+; AVX-NEXT: retq
 %x = load double, double* %p
 %ins = insertelement <4 x double> undef, double %x, i32 3
 ret <4 x double> %ins
diff --git a/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll b/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll
index 9f9fe237ad4..d5631801e39 100644
--- a/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll
+++ b/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll
@@ -274,13 +274,21 @@ define <4 x float> @test11(<4 x float> %A, <4 x float> %B) {
 ; SSE-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: test11:
-; AVX: # %bb.0:
-; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; AVX-NEXT: retq
+; AVX1-LABEL: test11:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-NEXT: vsubss %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT: retq
+;
+; AVX512-LABEL: test11:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX512-NEXT: vsubss %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vbroadcastss %xmm0, %xmm0
+; AVX512-NEXT: retq
 %1 = extractelement <4 x float> %A, i32 2
 %2 = extractelement <4 x float> %B, i32 2
 %sub = fsub float %1, %2
diff --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll
index c466ee7ed18..b5c37263423 100644
--- a/llvm/test/CodeGen/X86/sse41.ll
+++ b/llvm/test/CodeGen/X86/sse41.ll
@@ -97,8 +97,9 @@ define <2 x i64> @pmovzxbq_1() nounwind {
 ; X86-AVX512: ## %bb.0: ## %entry
 ; X86-AVX512-NEXT: movl L_g16$non_lazy_ptr, %eax ## encoding: [0xa1,A,A,A,A]
 ; X86-AVX512-NEXT: ## fixup A - offset: 1, value: L_g16$non_lazy_ptr, kind: FK_Data_4
-; X86-AVX512-NEXT: vpmovzxbq (%eax), %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x32,0x00]
-; X86-AVX512-NEXT: ## xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; X86-AVX512-NEXT: vpbroadcastw (%eax), %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x79,0x00]
+; X86-AVX512-NEXT: vpmovzxbq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x32,0xc0]
+; X86-AVX512-NEXT: ## xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
 ; X86-AVX512-NEXT: retl ## encoding: [0xc3]
 ;
 ; X64-SSE-LABEL: pmovzxbq_1:
@@ -121,8 +122,9 @@ define <2 x i64> @pmovzxbq_1() nounwind {
 ; X64-AVX512: ## %bb.0: ## %entry
 ; X64-AVX512-NEXT: movq _g16@{{.*}}(%rip), %rax ## encoding: [0x48,0x8b,0x05,A,A,A,A]
 ; X64-AVX512-NEXT: ## fixup A - offset: 3, value: _g16@GOTPCREL-4, kind: reloc_riprel_4byte_movq_load
-; X64-AVX512-NEXT: vpmovzxbq (%rax), %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x32,0x00]
-; X64-AVX512-NEXT: ## xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; X64-AVX512-NEXT: vpbroadcastw (%rax), %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x79,0x00]
+; X64-AVX512-NEXT: vpmovzxbq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x32,0xc0]
+; X64-AVX512-NEXT: ## xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
 ; X64-AVX512-NEXT: retq ## encoding: [0xc3]
 entry:
 %0 = load i16, i16* @g16, align 2 ; <i16> [#uses=1]
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
index 367a72f56c4..c59db9b2f0a 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
@@ -2619,8 +2619,9 @@ define <8 x i16> @insert_dup_elt3_mem_v8i16_i32(i32* %ptr) {
 ;
 ; AVX1-LABEL: insert_dup_elt3_mem_v8i16_i32:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
+; AVX1-NEXT: vbroadcastss (%rdi), %xmm0
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; AVX1-NEXT: retq
 ;
 ; AVX2OR512VL-LABEL: insert_dup_elt3_mem_v8i16_i32:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
index 01998a5ac01..3566ebde720 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -4597,8 +4597,9 @@ define <16 x i16> @insert_dup_elt1_mem_v16i16_i32(i32* %ptr) #0 {
 define <16 x i16> @insert_dup_elt3_mem_v16i16_i32(i32* %ptr) #0 {
 ; AVX1-LABEL: insert_dup_elt3_mem_v16i16_i32:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
+; AVX1-NEXT: vbroadcastss (%rdi), %xmm0
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX1-NEXT: retq
 ;

