diff options
| author | Chandler Carruth <chandlerc@gmail.com> | 2018-02-17 02:26:25 +0000 |
|---|---|---|
| committer | Chandler Carruth <chandlerc@gmail.com> | 2018-02-17 02:26:25 +0000 |
| commit | a1d6107b14b3ceaf5a34a00c1326775ac72e353f (patch) | |
| tree | 4a43765b8d5dc177a82db71b3fc45109399ff035 /llvm/test | |
| parent | 841ca95219c9ddb8241372a592b5b923a0087a2a (diff) | |
| download | bcm5719-llvm-a1d6107b14b3ceaf5a34a00c1326775ac72e353f.tar.gz bcm5719-llvm-a1d6107b14b3ceaf5a34a00c1326775ac72e353f.zip | |
[DAG, X86] Revert r324797, r324491, and r324359.
Sadly, r324359 caused at least PR36312. There is a patch out for review,
but it seems to be taking a while, and we've already had these crashers in
tree for too long. We're hitting this PR in real code now and are
blocked on shipping new compilers as a consequence, so I'm reverting us
back to green.
Sorry for the churn due to the stacked changes that I had to revert. =/
llvm-svn: 325420
Diffstat (limited to 'llvm/test')
| -rw-r--r-- | llvm/test/CodeGen/X86/avg.ll | 114 | ||||
| -rw-r--r-- | llvm/test/CodeGen/X86/avx-vbroadcastf128.ll | 6 | ||||
| -rw-r--r-- | llvm/test/CodeGen/X86/avx2-vbroadcast.ll | 191 | ||||
| -rw-r--r-- | llvm/test/CodeGen/X86/avx2-vbroadcasti128.ll | 6 | ||||
| -rw-r--r-- | llvm/test/CodeGen/X86/avx512-vbroadcasti128.ll | 9 | ||||
| -rw-r--r-- | llvm/test/CodeGen/X86/foldmem_cycle.ll | 34 | ||||
| -rw-r--r-- | llvm/test/CodeGen/X86/i256-add.ll | 145 | ||||
| -rw-r--r-- | llvm/test/CodeGen/X86/load-op-store-fusion.ll | 32 | ||||
| -rw-r--r-- | llvm/test/CodeGen/X86/masked_memop.ll | 3 | ||||
| -rw-r--r-- | llvm/test/CodeGen/X86/merge-consecutive-stores.ll | 9 | ||||
| -rw-r--r-- | llvm/test/CodeGen/X86/nontemporal.ll | 82 | ||||
| -rw-r--r-- | llvm/test/CodeGen/X86/store_op_load_fold2.ll | 12 | ||||
| -rw-r--r-- | llvm/test/CodeGen/X86/subvector-broadcast.ll | 48 | ||||
| -rw-r--r-- | llvm/test/CodeGen/X86/var-permute-256.ll | 540 | ||||
| -rw-r--r-- | llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll | 3 |
15 files changed, 738 insertions, 496 deletions
diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll index dd923170f8f..8e1e5f3b5ca 100644 --- a/llvm/test/CodeGen/X86/avg.ll +++ b/llvm/test/CodeGen/X86/avg.ll @@ -90,12 +90,12 @@ define void @avg_v16i8(<16 x i8>* %a, <16 x i8>* %b) nounwind { define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) nounwind { ; SSE2-LABEL: avg_v32i8: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rsi), %xmm0 -; SSE2-NEXT: movdqa 16(%rsi), %xmm1 -; SSE2-NEXT: pavgb (%rdi), %xmm0 -; SSE2-NEXT: pavgb 16(%rdi), %xmm1 -; SSE2-NEXT: movdqu %xmm1, (%rax) +; SSE2-NEXT: movdqa 16(%rdi), %xmm0 +; SSE2-NEXT: movdqa (%rsi), %xmm1 +; SSE2-NEXT: pavgb (%rdi), %xmm1 +; SSE2-NEXT: pavgb 16(%rsi), %xmm0 ; SSE2-NEXT: movdqu %xmm0, (%rax) +; SSE2-NEXT: movdqu %xmm1, (%rax) ; SSE2-NEXT: retq ; ; AVX1-LABEL: avg_v32i8: @@ -545,18 +545,18 @@ define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind { define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) nounwind { ; SSE2-LABEL: avg_v64i8: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rsi), %xmm0 -; SSE2-NEXT: movdqa 16(%rsi), %xmm1 -; SSE2-NEXT: movdqa 32(%rsi), %xmm2 +; SSE2-NEXT: movdqa 32(%rdi), %xmm0 +; SSE2-NEXT: movdqa (%rsi), %xmm1 +; SSE2-NEXT: movdqa 16(%rsi), %xmm2 ; SSE2-NEXT: movdqa 48(%rsi), %xmm3 -; SSE2-NEXT: pavgb (%rdi), %xmm0 -; SSE2-NEXT: pavgb 16(%rdi), %xmm1 -; SSE2-NEXT: pavgb 32(%rdi), %xmm2 +; SSE2-NEXT: pavgb (%rdi), %xmm1 +; SSE2-NEXT: pavgb 16(%rdi), %xmm2 +; SSE2-NEXT: pavgb 32(%rsi), %xmm0 ; SSE2-NEXT: pavgb 48(%rdi), %xmm3 ; SSE2-NEXT: movdqu %xmm3, (%rax) +; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: movdqu %xmm2, (%rax) ; SSE2-NEXT: movdqu %xmm1, (%rax) -; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq ; ; AVX1-LABEL: avg_v64i8: @@ -582,23 +582,23 @@ define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) nounwind { ; ; AVX2-LABEL: avg_v64i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rsi), %ymm0 -; AVX2-NEXT: vmovdqa 32(%rsi), %ymm1 -; AVX2-NEXT: vpavgb (%rdi), %ymm0, %ymm0 -; AVX2-NEXT: vpavgb 32(%rdi), %ymm1, 
%ymm1 -; AVX2-NEXT: vmovdqu %ymm1, (%rax) +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX2-NEXT: vmovdqa (%rsi), %ymm1 +; AVX2-NEXT: vpavgb (%rdi), %ymm1, %ymm1 +; AVX2-NEXT: vpavgb 32(%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, (%rax) +; AVX2-NEXT: vmovdqu %ymm1, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: avg_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rsi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1 -; AVX512F-NEXT: vpavgb (%rdi), %ymm0, %ymm0 -; AVX512F-NEXT: vpavgb 32(%rdi), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqu %ymm1, (%rax) +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512F-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512F-NEXT: vpavgb (%rdi), %ymm1, %ymm1 +; AVX512F-NEXT: vpavgb 32(%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqu %ymm0, (%rax) +; AVX512F-NEXT: vmovdqu %ymm1, (%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -678,12 +678,12 @@ define void @avg_v8i16(<8 x i16>* %a, <8 x i16>* %b) nounwind { define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) nounwind { ; SSE2-LABEL: avg_v16i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rsi), %xmm0 -; SSE2-NEXT: movdqa 16(%rsi), %xmm1 -; SSE2-NEXT: pavgw (%rdi), %xmm0 -; SSE2-NEXT: pavgw 16(%rdi), %xmm1 -; SSE2-NEXT: movdqu %xmm1, (%rax) +; SSE2-NEXT: movdqa 16(%rdi), %xmm0 +; SSE2-NEXT: movdqa (%rsi), %xmm1 +; SSE2-NEXT: pavgw (%rdi), %xmm1 +; SSE2-NEXT: pavgw 16(%rsi), %xmm0 ; SSE2-NEXT: movdqu %xmm0, (%rax) +; SSE2-NEXT: movdqu %xmm1, (%rax) ; SSE2-NEXT: retq ; ; AVX1-LABEL: avg_v16i16: @@ -729,18 +729,18 @@ define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) nounwind { define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) nounwind { ; SSE2-LABEL: avg_v32i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rsi), %xmm0 -; SSE2-NEXT: movdqa 16(%rsi), %xmm1 -; SSE2-NEXT: movdqa 32(%rsi), %xmm2 +; SSE2-NEXT: movdqa 32(%rdi), %xmm0 +; SSE2-NEXT: movdqa (%rsi), %xmm1 +; SSE2-NEXT: movdqa 16(%rsi), %xmm2 ; SSE2-NEXT: movdqa 48(%rsi), %xmm3 -; SSE2-NEXT: pavgw (%rdi), %xmm0 -; 
SSE2-NEXT: pavgw 16(%rdi), %xmm1 -; SSE2-NEXT: pavgw 32(%rdi), %xmm2 +; SSE2-NEXT: pavgw (%rdi), %xmm1 +; SSE2-NEXT: pavgw 16(%rdi), %xmm2 +; SSE2-NEXT: pavgw 32(%rsi), %xmm0 ; SSE2-NEXT: pavgw 48(%rdi), %xmm3 ; SSE2-NEXT: movdqu %xmm3, (%rax) +; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: movdqu %xmm2, (%rax) ; SSE2-NEXT: movdqu %xmm1, (%rax) -; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq ; ; AVX1-LABEL: avg_v32i16: @@ -766,23 +766,23 @@ define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) nounwind { ; ; AVX2-LABEL: avg_v32i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rsi), %ymm0 -; AVX2-NEXT: vmovdqa 32(%rsi), %ymm1 -; AVX2-NEXT: vpavgw (%rdi), %ymm0, %ymm0 -; AVX2-NEXT: vpavgw 32(%rdi), %ymm1, %ymm1 -; AVX2-NEXT: vmovdqu %ymm1, (%rax) +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX2-NEXT: vmovdqa (%rsi), %ymm1 +; AVX2-NEXT: vpavgw (%rdi), %ymm1, %ymm1 +; AVX2-NEXT: vpavgw 32(%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, (%rax) +; AVX2-NEXT: vmovdqu %ymm1, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: avg_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rsi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1 -; AVX512F-NEXT: vpavgw (%rdi), %ymm0, %ymm0 -; AVX512F-NEXT: vpavgw 32(%rdi), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqu %ymm1, (%rax) +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512F-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512F-NEXT: vpavgw (%rdi), %ymm1, %ymm1 +; AVX512F-NEXT: vpavgw 32(%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqu %ymm0, (%rax) +; AVX512F-NEXT: vmovdqu %ymm1, (%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -891,9 +891,9 @@ define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) nounwind { ; SSE2-LABEL: avg_v32i8_2: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: movdqa 16(%rdi), %xmm1 +; SSE2-NEXT: movdqa 16(%rsi), %xmm1 ; SSE2-NEXT: pavgb (%rsi), %xmm0 -; SSE2-NEXT: pavgb 16(%rsi), %xmm1 +; SSE2-NEXT: pavgb 16(%rdi), %xmm1 ; SSE2-NEXT: movdqu %xmm1, (%rax) ; SSE2-NEXT: movdqu %xmm0, 
(%rax) ; SSE2-NEXT: retq @@ -1072,9 +1072,9 @@ define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) nounwind { ; SSE2-LABEL: avg_v16i16_2: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: movdqa 16(%rdi), %xmm1 +; SSE2-NEXT: movdqa 16(%rsi), %xmm1 ; SSE2-NEXT: pavgw (%rsi), %xmm0 -; SSE2-NEXT: pavgw 16(%rsi), %xmm1 +; SSE2-NEXT: pavgw 16(%rdi), %xmm1 ; SSE2-NEXT: movdqu %xmm1, (%rax) ; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq @@ -1124,14 +1124,14 @@ define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) nounwind { ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 16(%rdi), %xmm1 -; SSE2-NEXT: movdqa 32(%rdi), %xmm2 -; SSE2-NEXT: movdqa 48(%rdi), %xmm3 +; SSE2-NEXT: movdqa 48(%rdi), %xmm2 +; SSE2-NEXT: movdqa 32(%rsi), %xmm3 ; SSE2-NEXT: pavgw (%rsi), %xmm0 ; SSE2-NEXT: pavgw 16(%rsi), %xmm1 -; SSE2-NEXT: pavgw 32(%rsi), %xmm2 -; SSE2-NEXT: pavgw 48(%rsi), %xmm3 -; SSE2-NEXT: movdqu %xmm3, (%rax) +; SSE2-NEXT: pavgw 32(%rdi), %xmm3 +; SSE2-NEXT: pavgw 48(%rsi), %xmm2 ; SSE2-NEXT: movdqu %xmm2, (%rax) +; SSE2-NEXT: movdqu %xmm3, (%rax) ; SSE2-NEXT: movdqu %xmm1, (%rax) ; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq @@ -1160,9 +1160,9 @@ define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) nounwind { ; AVX2-LABEL: avg_v32i16_2: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX2-NEXT: vmovdqa 32(%rsi), %ymm1 ; AVX2-NEXT: vpavgw (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vpavgw 32(%rsi), %ymm1, %ymm1 +; AVX2-NEXT: vpavgw 32(%rdi), %ymm1, %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, (%rax) ; AVX2-NEXT: vmovdqu %ymm0, (%rax) ; AVX2-NEXT: vzeroupper @@ -1171,9 +1171,9 @@ define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) nounwind { ; AVX512F-LABEL: avg_v32i16_2: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1 ; AVX512F-NEXT: vpavgw (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpavgw 32(%rsi), 
%ymm1, %ymm1 +; AVX512F-NEXT: vpavgw 32(%rdi), %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqu %ymm1, (%rax) ; AVX512F-NEXT: vmovdqu %ymm0, (%rax) ; AVX512F-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/avx-vbroadcastf128.ll b/llvm/test/CodeGen/X86/avx-vbroadcastf128.ll index b5026437153..7fdbf31a993 100644 --- a/llvm/test/CodeGen/X86/avx-vbroadcastf128.ll +++ b/llvm/test/CodeGen/X86/avx-vbroadcastf128.ll @@ -235,16 +235,18 @@ define <8 x i32> @PR29088(<4 x i32>* %p0, <8 x float>* %p1) { ; X32: # %bb.0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: vmovaps (%ecx), %xmm0 ; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] ; X32-NEXT: vmovaps %ymm1, (%eax) +; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: PR29088: ; X64: # %bb.0: +; X64-NEXT: vmovaps (%rdi), %xmm0 ; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] ; X64-NEXT: vmovaps %ymm1, (%rsi) +; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X64-NEXT: retq %ld = load <4 x i32>, <4 x i32>* %p0 store <8 x float> zeroinitializer, <8 x float>* %p1 diff --git a/llvm/test/CodeGen/X86/avx2-vbroadcast.ll b/llvm/test/CodeGen/X86/avx2-vbroadcast.ll index 3ae6c0b9d81..528dfcd6f8d 100644 --- a/llvm/test/CodeGen/X86/avx2-vbroadcast.ll +++ b/llvm/test/CodeGen/X86/avx2-vbroadcast.ll @@ -1065,7 +1065,9 @@ define void @isel_crash_16b(i8* %cV_R.addr) { ; X64: ## %bb.0: ## %eintry ; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: vpbroadcastb (%rdi), %xmm1 +; X64-NEXT: movb (%rdi), %al +; X64-NEXT: vmovd %eax, %xmm1 +; X64-NEXT: vpbroadcastb %xmm1, %xmm1 ; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp) ; X64-NEXT: retq @@ -1116,7 +1118,9 @@ define void @isel_crash_32b(i8* %cV_R.addr) { ; X64-NEXT: subq $128, %rsp ; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; X64-NEXT: vmovaps %ymm0, (%rsp) 
-; X64-NEXT: vpbroadcastb (%rdi), %ymm1 +; X64-NEXT: movb (%rdi), %al +; X64-NEXT: vmovd %eax, %xmm1 +; X64-NEXT: vpbroadcastb %xmm1, %ymm1 ; X64-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) ; X64-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp) ; X64-NEXT: movq %rbp, %rsp @@ -1156,7 +1160,9 @@ define void @isel_crash_8w(i16* %cV_R.addr) { ; X64: ## %bb.0: ## %entry ; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: vpbroadcastw (%rdi), %xmm1 +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: vmovd %eax, %xmm1 +; X64-NEXT: vpbroadcastw %xmm1, %xmm1 ; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp) ; X64-NEXT: retq @@ -1207,7 +1213,9 @@ define void @isel_crash_16w(i16* %cV_R.addr) { ; X64-NEXT: subq $128, %rsp ; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; X64-NEXT: vmovaps %ymm0, (%rsp) -; X64-NEXT: vpbroadcastw (%rdi), %ymm1 +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: vmovd %eax, %xmm1 +; X64-NEXT: vpbroadcastw %xmm1, %ymm1 ; X64-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) ; X64-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp) ; X64-NEXT: movq %rbp, %rsp @@ -1243,14 +1251,26 @@ define void @isel_crash_4d(i32* %cV_R.addr) { ; X32-NEXT: addl $60, %esp ; X32-NEXT: retl ; -; X64-LABEL: isel_crash_4d: -; X64: ## %bb.0: ## %entry -; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: vbroadcastss (%rdi), %xmm1 -; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-NEXT: retq +; X64-AVX2-LABEL: isel_crash_4d: +; X64-AVX2: ## %bb.0: ## %entry +; X64-AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-AVX2-NEXT: movl (%rdi), %eax +; X64-AVX2-NEXT: vmovd %eax, %xmm1 +; X64-AVX2-NEXT: vpbroadcastd %xmm1, %xmm1 +; X64-AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-AVX2-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp) +; X64-AVX2-NEXT: retq +; +; X64-AVX512VL-LABEL: isel_crash_4d: +; X64-AVX512VL: ## %bb.0: ## 
%entry +; X64-AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; X64-AVX512VL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-AVX512VL-NEXT: movl (%rdi), %eax +; X64-AVX512VL-NEXT: vpbroadcastd %eax, %xmm1 +; X64-AVX512VL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-AVX512VL-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp) +; X64-AVX512VL-NEXT: retq entry: %__a.addr.i = alloca <2 x i64>, align 16 %__b.addr.i = alloca <2 x i64>, align 16 @@ -1287,24 +1307,46 @@ define void @isel_crash_8d(i32* %cV_R.addr) { ; X32-NEXT: vzeroupper ; X32-NEXT: retl ; -; X64-LABEL: isel_crash_8d: -; X64: ## %bb.0: ## %eintry -; X64-NEXT: pushq %rbp -; X64-NEXT: .cfi_def_cfa_offset 16 -; X64-NEXT: .cfi_offset %rbp, -16 -; X64-NEXT: movq %rsp, %rbp -; X64-NEXT: .cfi_def_cfa_register %rbp -; X64-NEXT: andq $-32, %rsp -; X64-NEXT: subq $128, %rsp -; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X64-NEXT: vmovaps %ymm0, (%rsp) -; X64-NEXT: vbroadcastss (%rdi), %ymm1 -; X64-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; X64-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) -; X64-NEXT: movq %rbp, %rsp -; X64-NEXT: popq %rbp -; X64-NEXT: vzeroupper -; X64-NEXT: retq +; X64-AVX2-LABEL: isel_crash_8d: +; X64-AVX2: ## %bb.0: ## %eintry +; X64-AVX2-NEXT: pushq %rbp +; X64-AVX2-NEXT: .cfi_def_cfa_offset 16 +; X64-AVX2-NEXT: .cfi_offset %rbp, -16 +; X64-AVX2-NEXT: movq %rsp, %rbp +; X64-AVX2-NEXT: .cfi_def_cfa_register %rbp +; X64-AVX2-NEXT: andq $-32, %rsp +; X64-AVX2-NEXT: subq $128, %rsp +; X64-AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovaps %ymm0, (%rsp) +; X64-AVX2-NEXT: movl (%rdi), %eax +; X64-AVX2-NEXT: vmovd %eax, %xmm1 +; X64-AVX2-NEXT: vpbroadcastd %xmm1, %ymm1 +; X64-AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) +; X64-AVX2-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp) +; X64-AVX2-NEXT: movq %rbp, %rsp +; X64-AVX2-NEXT: popq %rbp +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512VL-LABEL: isel_crash_8d: +; X64-AVX512VL: ## %bb.0: ## %eintry +; X64-AVX512VL-NEXT: pushq %rbp +; X64-AVX512VL-NEXT: 
.cfi_def_cfa_offset 16 +; X64-AVX512VL-NEXT: .cfi_offset %rbp, -16 +; X64-AVX512VL-NEXT: movq %rsp, %rbp +; X64-AVX512VL-NEXT: .cfi_def_cfa_register %rbp +; X64-AVX512VL-NEXT: andq $-32, %rsp +; X64-AVX512VL-NEXT: subq $128, %rsp +; X64-AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; X64-AVX512VL-NEXT: vmovaps %ymm0, (%rsp) +; X64-AVX512VL-NEXT: movl (%rdi), %eax +; X64-AVX512VL-NEXT: vpbroadcastd %eax, %ymm1 +; X64-AVX512VL-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) +; X64-AVX512VL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp) +; X64-AVX512VL-NEXT: movq %rbp, %rsp +; X64-AVX512VL-NEXT: popq %rbp +; X64-AVX512VL-NEXT: vzeroupper +; X64-AVX512VL-NEXT: retq eintry: %__a.addr.i = alloca <4 x i64>, align 16 %__b.addr.i = alloca <4 x i64>, align 16 @@ -1328,20 +1370,33 @@ define void @isel_crash_2q(i64* %cV_R.addr) { ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; X32-NEXT: vmovaps %xmm0, (%esp) -; X32-NEXT: vpbroadcastq (%eax), %xmm1 +; X32-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; X32-NEXT: vpbroadcastq %xmm1, %xmm1 ; X32-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) ; X32-NEXT: vmovdqa %xmm1, {{[0-9]+}}(%esp) ; X32-NEXT: addl $60, %esp ; X32-NEXT: retl ; -; X64-LABEL: isel_crash_2q: -; X64: ## %bb.0: ## %entry -; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: vpbroadcastq (%rdi), %xmm1 -; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp) -; X64-NEXT: retq +; X64-AVX2-LABEL: isel_crash_2q: +; X64-AVX2: ## %bb.0: ## %entry +; X64-AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-AVX2-NEXT: movq (%rdi), %rax +; X64-AVX2-NEXT: vmovq %rax, %xmm1 +; X64-AVX2-NEXT: vpbroadcastq %xmm1, %xmm1 +; X64-AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-AVX2-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp) +; X64-AVX2-NEXT: retq +; +; X64-AVX512VL-LABEL: isel_crash_2q: +; X64-AVX512VL: ## %bb.0: ## %entry +; X64-AVX512VL-NEXT: vxorps %xmm0, %xmm0, 
%xmm0 +; X64-AVX512VL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-AVX512VL-NEXT: movq (%rdi), %rax +; X64-AVX512VL-NEXT: vpbroadcastq %rax, %xmm1 +; X64-AVX512VL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-AVX512VL-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp) +; X64-AVX512VL-NEXT: retq entry: %__a.addr.i = alloca <2 x i64>, align 16 %__b.addr.i = alloca <2 x i64>, align 16 @@ -1378,24 +1433,46 @@ define void @isel_crash_4q(i64* %cV_R.addr) { ; X32-NEXT: vzeroupper ; X32-NEXT: retl ; -; X64-LABEL: isel_crash_4q: -; X64: ## %bb.0: ## %eintry -; X64-NEXT: pushq %rbp -; X64-NEXT: .cfi_def_cfa_offset 16 -; X64-NEXT: .cfi_offset %rbp, -16 -; X64-NEXT: movq %rsp, %rbp -; X64-NEXT: .cfi_def_cfa_register %rbp -; X64-NEXT: andq $-32, %rsp -; X64-NEXT: subq $128, %rsp -; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; X64-NEXT: vmovaps %ymm0, (%rsp) -; X64-NEXT: vbroadcastsd (%rdi), %ymm1 -; X64-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; X64-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) -; X64-NEXT: movq %rbp, %rsp -; X64-NEXT: popq %rbp -; X64-NEXT: vzeroupper -; X64-NEXT: retq +; X64-AVX2-LABEL: isel_crash_4q: +; X64-AVX2: ## %bb.0: ## %eintry +; X64-AVX2-NEXT: pushq %rbp +; X64-AVX2-NEXT: .cfi_def_cfa_offset 16 +; X64-AVX2-NEXT: .cfi_offset %rbp, -16 +; X64-AVX2-NEXT: movq %rsp, %rbp +; X64-AVX2-NEXT: .cfi_def_cfa_register %rbp +; X64-AVX2-NEXT: andq $-32, %rsp +; X64-AVX2-NEXT: subq $128, %rsp +; X64-AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovaps %ymm0, (%rsp) +; X64-AVX2-NEXT: movq (%rdi), %rax +; X64-AVX2-NEXT: vmovq %rax, %xmm1 +; X64-AVX2-NEXT: vpbroadcastq %xmm1, %ymm1 +; X64-AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) +; X64-AVX2-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp) +; X64-AVX2-NEXT: movq %rbp, %rsp +; X64-AVX2-NEXT: popq %rbp +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512VL-LABEL: isel_crash_4q: +; X64-AVX512VL: ## %bb.0: ## %eintry +; X64-AVX512VL-NEXT: pushq %rbp +; X64-AVX512VL-NEXT: .cfi_def_cfa_offset 16 +; X64-AVX512VL-NEXT: .cfi_offset 
%rbp, -16 +; X64-AVX512VL-NEXT: movq %rsp, %rbp +; X64-AVX512VL-NEXT: .cfi_def_cfa_register %rbp +; X64-AVX512VL-NEXT: andq $-32, %rsp +; X64-AVX512VL-NEXT: subq $128, %rsp +; X64-AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; X64-AVX512VL-NEXT: vmovaps %ymm0, (%rsp) +; X64-AVX512VL-NEXT: movq (%rdi), %rax +; X64-AVX512VL-NEXT: vpbroadcastq %rax, %ymm1 +; X64-AVX512VL-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) +; X64-AVX512VL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp) +; X64-AVX512VL-NEXT: movq %rbp, %rsp +; X64-AVX512VL-NEXT: popq %rbp +; X64-AVX512VL-NEXT: vzeroupper +; X64-AVX512VL-NEXT: retq eintry: %__a.addr.i = alloca <4 x i64>, align 16 %__b.addr.i = alloca <4 x i64>, align 16 diff --git a/llvm/test/CodeGen/X86/avx2-vbroadcasti128.ll b/llvm/test/CodeGen/X86/avx2-vbroadcasti128.ll index 996e6796616..254cdfdd8cb 100644 --- a/llvm/test/CodeGen/X86/avx2-vbroadcasti128.ll +++ b/llvm/test/CodeGen/X86/avx2-vbroadcasti128.ll @@ -271,16 +271,18 @@ define <8 x i32> @PR29088(<4 x i32>* %p0, <8 x float>* %p1) { ; X32: # %bb.0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: vmovaps (%ecx), %xmm0 ; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] ; X32-NEXT: vmovaps %ymm1, (%eax) +; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: PR29088: ; X64: # %bb.0: +; X64-NEXT: vmovaps (%rdi), %xmm0 ; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] ; X64-NEXT: vmovaps %ymm1, (%rsi) +; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X64-NEXT: retq %ld = load <4 x i32>, <4 x i32>* %p0 store <8 x float> zeroinitializer, <8 x float>* %p1 diff --git a/llvm/test/CodeGen/X86/avx512-vbroadcasti128.ll b/llvm/test/CodeGen/X86/avx512-vbroadcasti128.ll index 2bf69cfadcf..c5ecb1559b4 100644 --- a/llvm/test/CodeGen/X86/avx512-vbroadcasti128.ll +++ b/llvm/test/CodeGen/X86/avx512-vbroadcasti128.ll @@ -186,23 +186,26 @@ define <64 x i8> 
@test_broadcast_16i8_64i8(<16 x i8> *%p) nounwind { define <8 x i32> @PR29088(<4 x i32>* %p0, <8 x float>* %p1) { ; X64-AVX512VL-LABEL: PR29088: ; X64-AVX512VL: ## %bb.0: +; X64-AVX512VL-NEXT: vmovaps (%rdi), %xmm0 ; X64-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X64-AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] ; X64-AVX512VL-NEXT: vmovdqa %ymm1, (%rsi) +; X64-AVX512VL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X64-AVX512VL-NEXT: retq ; ; X64-AVX512BWVL-LABEL: PR29088: ; X64-AVX512BWVL: ## %bb.0: +; X64-AVX512BWVL-NEXT: vmovaps (%rdi), %xmm0 ; X64-AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X64-AVX512BWVL-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] ; X64-AVX512BWVL-NEXT: vmovdqa %ymm1, (%rsi) +; X64-AVX512BWVL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X64-AVX512BWVL-NEXT: retq ; ; X64-AVX512DQVL-LABEL: PR29088: ; X64-AVX512DQVL: ## %bb.0: +; X64-AVX512DQVL-NEXT: vmovaps (%rdi), %xmm0 ; X64-AVX512DQVL-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X64-AVX512DQVL-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] ; X64-AVX512DQVL-NEXT: vmovaps %ymm1, (%rsi) +; X64-AVX512DQVL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X64-AVX512DQVL-NEXT: retq %ld = load <4 x i32>, <4 x i32>* %p0 store <8 x float> zeroinitializer, <8 x float>* %p1 diff --git a/llvm/test/CodeGen/X86/foldmem_cycle.ll b/llvm/test/CodeGen/X86/foldmem_cycle.ll deleted file mode 100644 index 78ac9f29b16..00000000000 --- a/llvm/test/CodeGen/X86/foldmem_cycle.ll +++ /dev/null @@ -1,34 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=X64 - -; The load should not be merged with the and asit causes a cycle in the DAG. 
- -define void @foo() { -; X64-LABEL: foo: -; X64: # %bb.0: # %entry -; X64-NEXT: pushq %rbx -; X64-NEXT: .cfi_def_cfa_offset 16 -; X64-NEXT: .cfi_offset %rbx, -16 -; X64-NEXT: movl (%rax), %ebx -; X64-NEXT: callq bar -; X64-NEXT: testl %ebx, %eax -; X64-NEXT: jne .LBB0_2 -; X64-NEXT: # %bb.1: # %if.then -; X64-NEXT: popq %rbx -; X64-NEXT: retq -; X64-NEXT: .LBB0_2: # %if.end -entry: - %0 = load i32, i32* undef - %call = tail call i32 @bar() - %and = and i32 %call, %0 - %tobool = icmp eq i32 %and, 0 - br i1 %tobool, label %if.then, label %if.end - -if.then: - ret void - -if.end: - unreachable -} - -declare i32 @bar() diff --git a/llvm/test/CodeGen/X86/i256-add.ll b/llvm/test/CodeGen/X86/i256-add.ll index 85a885a4315..36d838a68cb 100644 --- a/llvm/test/CodeGen/X86/i256-add.ll +++ b/llvm/test/CodeGen/X86/i256-add.ll @@ -9,30 +9,40 @@ define void @add(i256* %p, i256* %q) nounwind { ; X32-NEXT: pushl %ebx ; X32-NEXT: pushl %edi ; X32-NEXT: pushl %esi -; X32-NEXT: subl $8, %esp -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 28(%eax), %ecx -; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill -; X32-NEXT: movl 24(%eax), %ecx -; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X32-NEXT: movl 20(%eax), %esi -; X32-NEXT: movl 16(%eax), %edi -; X32-NEXT: movl 12(%eax), %ebx -; X32-NEXT: movl 8(%eax), %ebp -; X32-NEXT: movl (%eax), %ecx -; X32-NEXT: movl 4(%eax), %edx +; X32-NEXT: subl $12, %esp ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: addl %ecx, (%eax) -; X32-NEXT: adcl %edx, 4(%eax) -; X32-NEXT: adcl %ebp, 8(%eax) -; X32-NEXT: adcl %ebx, 12(%eax) -; X32-NEXT: adcl %edi, 16(%eax) -; X32-NEXT: adcl %esi, 20(%eax) -; X32-NEXT: movl (%esp), %ecx # 4-byte Reload -; X32-NEXT: adcl %ecx, 24(%eax) -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload -; X32-NEXT: adcl %ecx, 28(%eax) -; X32-NEXT: addl $8, %esp +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl 8(%ecx), %edi +; X32-NEXT: movl (%ecx), %edx +; X32-NEXT: movl 4(%ecx), %ebx +; X32-NEXT: 
movl 28(%eax), %esi +; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill +; X32-NEXT: movl 24(%eax), %ebp +; X32-NEXT: addl (%eax), %edx +; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill +; X32-NEXT: adcl 4(%eax), %ebx +; X32-NEXT: adcl 8(%eax), %edi +; X32-NEXT: movl %edi, (%esp) # 4-byte Spill +; X32-NEXT: movl 20(%eax), %edi +; X32-NEXT: movl 12(%eax), %edx +; X32-NEXT: movl 16(%eax), %esi +; X32-NEXT: adcl 12(%ecx), %edx +; X32-NEXT: adcl 16(%ecx), %esi +; X32-NEXT: adcl 20(%ecx), %edi +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: adcl 24(%ecx), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp # 4-byte Reload +; X32-NEXT: adcl %ebp, 28(%ecx) +; X32-NEXT: movl (%esp), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, 8(%ecx) +; X32-NEXT: movl %ebx, 4(%ecx) +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, (%ecx) +; X32-NEXT: movl %edx, 12(%ecx) +; X32-NEXT: movl %esi, 16(%ecx) +; X32-NEXT: movl %edi, 20(%ecx) +; X32-NEXT: movl %eax, 24(%ecx) +; X32-NEXT: addl $12, %esp ; X32-NEXT: popl %esi ; X32-NEXT: popl %edi ; X32-NEXT: popl %ebx @@ -41,14 +51,17 @@ define void @add(i256* %p, i256* %q) nounwind { ; ; X64-LABEL: add: ; X64: # %bb.0: -; X64-NEXT: movq 24(%rsi), %rax -; X64-NEXT: movq 16(%rsi), %rcx -; X64-NEXT: movq (%rsi), %rdx -; X64-NEXT: movq 8(%rsi), %rsi -; X64-NEXT: addq %rdx, (%rdi) -; X64-NEXT: adcq %rsi, 8(%rdi) -; X64-NEXT: adcq %rcx, 16(%rdi) -; X64-NEXT: adcq %rax, 24(%rdi) +; X64-NEXT: movq 16(%rdi), %rax +; X64-NEXT: movq (%rdi), %rcx +; X64-NEXT: movq 8(%rdi), %rdx +; X64-NEXT: movq 24(%rsi), %r8 +; X64-NEXT: addq (%rsi), %rcx +; X64-NEXT: adcq 8(%rsi), %rdx +; X64-NEXT: adcq 16(%rsi), %rax +; X64-NEXT: adcq %r8, 24(%rdi) +; X64-NEXT: movq %rax, 16(%rdi) +; X64-NEXT: movq %rdx, 8(%rdi) +; X64-NEXT: movq %rcx, (%rdi) ; X64-NEXT: retq %a = load i256, i256* %p %b = load i256, i256* %q @@ -64,28 +77,35 @@ define void @sub(i256* %p, i256* %q) nounwind { ; X32-NEXT: pushl %edi ; X32-NEXT: pushl %esi ; X32-NEXT: subl $8, 
%esp -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 28(%eax), %ecx -; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill -; X32-NEXT: movl 24(%eax), %ecx -; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X32-NEXT: movl 20(%eax), %esi -; X32-NEXT: movl 16(%eax), %edi -; X32-NEXT: movl 12(%eax), %ebx -; X32-NEXT: movl 8(%eax), %ebp -; X32-NEXT: movl (%eax), %ecx -; X32-NEXT: movl 4(%eax), %edx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: subl %ecx, (%eax) -; X32-NEXT: sbbl %edx, 4(%eax) -; X32-NEXT: sbbl %ebp, 8(%eax) -; X32-NEXT: sbbl %ebx, 12(%eax) -; X32-NEXT: sbbl %edi, 16(%eax) -; X32-NEXT: sbbl %esi, 20(%eax) -; X32-NEXT: movl (%esp), %ecx # 4-byte Reload -; X32-NEXT: sbbl %ecx, 24(%eax) -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload -; X32-NEXT: sbbl %ecx, 28(%eax) +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl 16(%ecx), %eax +; X32-NEXT: movl 12(%ecx), %edx +; X32-NEXT: movl 8(%ecx), %edi +; X32-NEXT: movl (%ecx), %ebx +; X32-NEXT: movl 4(%ecx), %ebp +; X32-NEXT: subl (%esi), %ebx +; X32-NEXT: sbbl 4(%esi), %ebp +; X32-NEXT: sbbl 8(%esi), %edi +; X32-NEXT: sbbl 12(%esi), %edx +; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill +; X32-NEXT: sbbl 16(%esi), %eax +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: movl 20(%ecx), %edx +; X32-NEXT: sbbl 20(%esi), %edx +; X32-NEXT: movl 24(%ecx), %eax +; X32-NEXT: sbbl 24(%esi), %eax +; X32-NEXT: movl 28(%esi), %esi +; X32-NEXT: sbbl %esi, 28(%ecx) +; X32-NEXT: movl %edi, 8(%ecx) +; X32-NEXT: movl %ebp, 4(%ecx) +; X32-NEXT: movl %ebx, (%ecx) +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload +; X32-NEXT: movl %esi, 12(%ecx) +; X32-NEXT: movl (%esp), %esi # 4-byte Reload +; X32-NEXT: movl %esi, 16(%ecx) +; X32-NEXT: movl %edx, 20(%ecx) +; X32-NEXT: movl %eax, 24(%ecx) ; X32-NEXT: addl $8, %esp ; X32-NEXT: popl %esi ; X32-NEXT: popl %edi @@ -95,14 +115,17 @@ define void @sub(i256* %p, i256* %q) nounwind { ; ; 
X64-LABEL: sub: ; X64: # %bb.0: -; X64-NEXT: movq 24(%rsi), %rax -; X64-NEXT: movq 16(%rsi), %rcx -; X64-NEXT: movq (%rsi), %rdx -; X64-NEXT: movq 8(%rsi), %rsi -; X64-NEXT: subq %rdx, (%rdi) -; X64-NEXT: sbbq %rsi, 8(%rdi) -; X64-NEXT: sbbq %rcx, 16(%rdi) -; X64-NEXT: sbbq %rax, 24(%rdi) +; X64-NEXT: movq 16(%rdi), %rax +; X64-NEXT: movq (%rdi), %rcx +; X64-NEXT: movq 8(%rdi), %rdx +; X64-NEXT: movq 24(%rsi), %r8 +; X64-NEXT: subq (%rsi), %rcx +; X64-NEXT: sbbq 8(%rsi), %rdx +; X64-NEXT: sbbq 16(%rsi), %rax +; X64-NEXT: sbbq %r8, 24(%rdi) +; X64-NEXT: movq %rax, 16(%rdi) +; X64-NEXT: movq %rdx, 8(%rdi) +; X64-NEXT: movq %rcx, (%rdi) ; X64-NEXT: retq %a = load i256, i256* %p %b = load i256, i256* %q diff --git a/llvm/test/CodeGen/X86/load-op-store-fusion.ll b/llvm/test/CodeGen/X86/load-op-store-fusion.ll deleted file mode 100644 index ee0d62d1341..00000000000 --- a/llvm/test/CodeGen/X86/load-op-store-fusion.ll +++ /dev/null @@ -1,32 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i386-unknown | FileCheck %s --check-prefix=X32 -; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=X64 - -; This test makes sure we do not merge both load-op-store pairs here as it causes a cycle. 
- -define i8* @fn(i32 %i.015.i, [64 x i64]* %data.i) { -; X32-LABEL: fn: -; X32: # %bb.0: # %entry -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl (%ecx,%eax,8), %edx -; X32-NEXT: addl $1, %edx -; X32-NEXT: adcl $0, 4(%ecx,%eax,8) -; X32-NEXT: movl %edx, (%ecx,%eax,8) -; X32-NEXT: xorl %eax, %eax -; X32-NEXT: retl -; -; X64-LABEL: fn: -; X64: # %bb.0: # %entry -; X64-NEXT: movslq %edi, %rax -; X64-NEXT: incq (%rsi,%rax,8) -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: retq -entry: - %arrayidx.i6 = getelementptr inbounds [64 x i64], [64 x i64]* %data.i, i32 0, i32 %i.015.i - %x8 = load volatile i64, i64* %arrayidx.i6, align 8 - %inc.i7 = add i64 %x8, 1 - store volatile i64 %inc.i7, i64* %arrayidx.i6, align 8 - ret i8* null -} - diff --git a/llvm/test/CodeGen/X86/masked_memop.ll b/llvm/test/CodeGen/X86/masked_memop.ll index aa6ae096445..4a250205051 100644 --- a/llvm/test/CodeGen/X86/masked_memop.ll +++ b/llvm/test/CodeGen/X86/masked_memop.ll @@ -1264,7 +1264,8 @@ define <8 x double> @load_one_mask_bit_set5(<8 x double>* %addr, <8 x double> %v ; AVX-LABEL: load_one_mask_bit_set5: ; AVX: ## %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-NEXT: vmovhpd {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero +; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/merge-consecutive-stores.ll b/llvm/test/CodeGen/X86/merge-consecutive-stores.ll index 4f511ef99e5..af5fb478e52 100644 --- a/llvm/test/CodeGen/X86/merge-consecutive-stores.ll +++ b/llvm/test/CodeGen/X86/merge-consecutive-stores.ll @@ -10,11 +10,12 @@ define i32 @foo (i64* %so) nounwind uwtable ssp { ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl $0, 28(%eax) ; CHECK-NEXT: movl $0, 24(%eax) -; CHECK-NEXT: xorl %ecx, %ecx -; CHECK-NEXT: cmpl 16(%eax), %ecx -; CHECK-NEXT: movl $0, 16(%eax) -; CHECK-NEXT: sbbl 20(%eax), %ecx +; 
CHECK-NEXT: movl 20(%eax), %ecx ; CHECK-NEXT: movl $0, 20(%eax) +; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: cmpl 16(%eax), %edx +; CHECK-NEXT: movl $0, 16(%eax) +; CHECK-NEXT: sbbl %ecx, %edx ; CHECK-NEXT: setl %al ; CHECK-NEXT: movzbl %al, %eax ; CHECK-NEXT: negl %eax diff --git a/llvm/test/CodeGen/X86/nontemporal.ll b/llvm/test/CodeGen/X86/nontemporal.ll index 472c3e4774c..f53982a8542 100644 --- a/llvm/test/CodeGen/X86/nontemporal.ll +++ b/llvm/test/CodeGen/X86/nontemporal.ll @@ -13,35 +13,36 @@ define i32 @f(<4 x float> %A, i8* %B, <2 x double> %C, i32 %D, <2 x i64> %E, <4 ; X32-SSE-NEXT: andl $-16, %esp ; X32-SSE-NEXT: subl $16, %esp ; X32-SSE-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero -; X32-SSE-NEXT: movl 12(%ebp), %ecx +; X32-SSE-NEXT: movl 12(%ebp), %eax ; X32-SSE-NEXT: movdqa 56(%ebp), %xmm4 ; X32-SSE-NEXT: movdqa 40(%ebp), %xmm5 ; X32-SSE-NEXT: movdqa 24(%ebp), %xmm6 -; X32-SSE-NEXT: movl 8(%ebp), %esi -; X32-SSE-NEXT: movl 80(%ebp), %edx -; X32-SSE-NEXT: movl (%edx), %eax +; X32-SSE-NEXT: movl 8(%ebp), %edx +; X32-SSE-NEXT: movl 80(%ebp), %ecx +; X32-SSE-NEXT: movl (%ecx), %esi ; X32-SSE-NEXT: addps {{\.LCPI.*}}, %xmm0 -; X32-SSE-NEXT: movntps %xmm0, (%esi) +; X32-SSE-NEXT: movntps %xmm0, (%edx) ; X32-SSE-NEXT: paddq {{\.LCPI.*}}, %xmm2 -; X32-SSE-NEXT: addl (%edx), %eax -; X32-SSE-NEXT: movntdq %xmm2, (%esi) +; X32-SSE-NEXT: addl (%ecx), %esi +; X32-SSE-NEXT: movntdq %xmm2, (%edx) ; X32-SSE-NEXT: addpd {{\.LCPI.*}}, %xmm1 -; X32-SSE-NEXT: addl (%edx), %eax -; X32-SSE-NEXT: movntpd %xmm1, (%esi) +; X32-SSE-NEXT: addl (%ecx), %esi +; X32-SSE-NEXT: movntpd %xmm1, (%edx) ; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm6 -; X32-SSE-NEXT: addl (%edx), %eax -; X32-SSE-NEXT: movntdq %xmm6, (%esi) +; X32-SSE-NEXT: addl (%ecx), %esi +; X32-SSE-NEXT: movntdq %xmm6, (%edx) ; X32-SSE-NEXT: paddw {{\.LCPI.*}}, %xmm5 -; X32-SSE-NEXT: addl (%edx), %eax -; X32-SSE-NEXT: movntdq %xmm5, (%esi) +; X32-SSE-NEXT: addl (%ecx), %esi +; X32-SSE-NEXT: movntdq %xmm5, (%edx) ; 
X32-SSE-NEXT: paddb {{\.LCPI.*}}, %xmm4 -; X32-SSE-NEXT: addl (%edx), %eax -; X32-SSE-NEXT: movntdq %xmm4, (%esi) -; X32-SSE-NEXT: addl (%edx), %eax -; X32-SSE-NEXT: movntil %ecx, (%esi) -; X32-SSE-NEXT: addl (%edx), %eax -; X32-SSE-NEXT: movsd %xmm3, (%esi) -; X32-SSE-NEXT: addl (%edx), %eax +; X32-SSE-NEXT: addl (%ecx), %esi +; X32-SSE-NEXT: movntdq %xmm4, (%edx) +; X32-SSE-NEXT: addl (%ecx), %esi +; X32-SSE-NEXT: movntil %eax, (%edx) +; X32-SSE-NEXT: movl (%ecx), %eax +; X32-SSE-NEXT: addl %esi, %eax +; X32-SSE-NEXT: movsd %xmm3, (%edx) +; X32-SSE-NEXT: addl (%ecx), %eax ; X32-SSE-NEXT: leal -4(%ebp), %esp ; X32-SSE-NEXT: popl %esi ; X32-SSE-NEXT: popl %ebp @@ -55,35 +56,36 @@ define i32 @f(<4 x float> %A, i8* %B, <2 x double> %C, i32 %D, <2 x i64> %E, <4 ; X32-AVX-NEXT: andl $-16, %esp ; X32-AVX-NEXT: subl $16, %esp ; X32-AVX-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; X32-AVX-NEXT: movl 12(%ebp), %ecx +; X32-AVX-NEXT: movl 12(%ebp), %eax ; X32-AVX-NEXT: vmovdqa 56(%ebp), %xmm4 ; X32-AVX-NEXT: vmovdqa 40(%ebp), %xmm5 ; X32-AVX-NEXT: vmovdqa 24(%ebp), %xmm6 -; X32-AVX-NEXT: movl 8(%ebp), %edx -; X32-AVX-NEXT: movl 80(%ebp), %esi -; X32-AVX-NEXT: movl (%esi), %eax +; X32-AVX-NEXT: movl 8(%ebp), %ecx +; X32-AVX-NEXT: movl 80(%ebp), %edx +; X32-AVX-NEXT: movl (%edx), %esi ; X32-AVX-NEXT: vaddps {{\.LCPI.*}}, %xmm0, %xmm0 -; X32-AVX-NEXT: vmovntps %xmm0, (%edx) +; X32-AVX-NEXT: vmovntps %xmm0, (%ecx) ; X32-AVX-NEXT: vpaddq {{\.LCPI.*}}, %xmm2, %xmm0 -; X32-AVX-NEXT: addl (%esi), %eax -; X32-AVX-NEXT: vmovntdq %xmm0, (%edx) +; X32-AVX-NEXT: addl (%edx), %esi +; X32-AVX-NEXT: vmovntdq %xmm0, (%ecx) ; X32-AVX-NEXT: vaddpd {{\.LCPI.*}}, %xmm1, %xmm0 -; X32-AVX-NEXT: addl (%esi), %eax -; X32-AVX-NEXT: vmovntpd %xmm0, (%edx) +; X32-AVX-NEXT: addl (%edx), %esi +; X32-AVX-NEXT: vmovntpd %xmm0, (%ecx) ; X32-AVX-NEXT: vpaddd {{\.LCPI.*}}, %xmm6, %xmm0 -; X32-AVX-NEXT: addl (%esi), %eax -; X32-AVX-NEXT: vmovntdq %xmm0, (%edx) +; X32-AVX-NEXT: addl (%edx), %esi +; X32-AVX-NEXT: 
vmovntdq %xmm0, (%ecx) ; X32-AVX-NEXT: vpaddw {{\.LCPI.*}}, %xmm5, %xmm0 -; X32-AVX-NEXT: addl (%esi), %eax -; X32-AVX-NEXT: vmovntdq %xmm0, (%edx) +; X32-AVX-NEXT: addl (%edx), %esi +; X32-AVX-NEXT: vmovntdq %xmm0, (%ecx) ; X32-AVX-NEXT: vpaddb {{\.LCPI.*}}, %xmm4, %xmm0 -; X32-AVX-NEXT: addl (%esi), %eax -; X32-AVX-NEXT: vmovntdq %xmm0, (%edx) -; X32-AVX-NEXT: addl (%esi), %eax -; X32-AVX-NEXT: movntil %ecx, (%edx) -; X32-AVX-NEXT: addl (%esi), %eax -; X32-AVX-NEXT: vmovsd %xmm3, (%edx) -; X32-AVX-NEXT: addl (%esi), %eax +; X32-AVX-NEXT: addl (%edx), %esi +; X32-AVX-NEXT: vmovntdq %xmm0, (%ecx) +; X32-AVX-NEXT: addl (%edx), %esi +; X32-AVX-NEXT: movntil %eax, (%ecx) +; X32-AVX-NEXT: movl (%edx), %eax +; X32-AVX-NEXT: addl %esi, %eax +; X32-AVX-NEXT: vmovsd %xmm3, (%ecx) +; X32-AVX-NEXT: addl (%edx), %eax ; X32-AVX-NEXT: leal -4(%ebp), %esp ; X32-AVX-NEXT: popl %esi ; X32-AVX-NEXT: popl %ebp diff --git a/llvm/test/CodeGen/X86/store_op_load_fold2.ll b/llvm/test/CodeGen/X86/store_op_load_fold2.ll index 674b8d8f938..f47d87f4bb8 100644 --- a/llvm/test/CodeGen/X86/store_op_load_fold2.ll +++ b/llvm/test/CodeGen/X86/store_op_load_fold2.ll @@ -17,14 +17,14 @@ cond_true2732.preheader: ; preds = %entry store i64 %tmp2676.us.us, i64* %tmp2666 ret i32 0 -; INTEL: and {{e..}}, dword ptr [356] -; INTEL: and dword ptr [360], {{e..}} -; FIXME: mov dword ptr [356], {{e..}} +; INTEL: and {{e..}}, dword ptr [360] +; INTEL: and dword ptr [356], {{e..}} +; FIXME: mov dword ptr [360], {{e..}} ; The above line comes out as 'mov 360, eax', but when the register is ecx it works? 
-; ATT: andl 356, %{{e..}} -; ATT: andl %{{e..}}, 360 -; ATT: movl %{{e..}}, 356 +; ATT: andl 360, %{{e..}} +; ATT: andl %{{e..}}, 356 +; ATT: movl %{{e..}}, 360 } diff --git a/llvm/test/CodeGen/X86/subvector-broadcast.ll b/llvm/test/CodeGen/X86/subvector-broadcast.ll index a4477b2375b..bcb7d14f953 100644 --- a/llvm/test/CodeGen/X86/subvector-broadcast.ll +++ b/llvm/test/CodeGen/X86/subvector-broadcast.ll @@ -751,64 +751,72 @@ define <8 x i32> @test_broadcast_4i32_8i32_chain(<4 x i32>* %p0, <4 x float>* %p ; X32-AVX: # %bb.0: ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-AVX-NEXT: vmovaps (%ecx), %xmm0 ; X32-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] ; X32-AVX-NEXT: vmovaps %xmm1, (%eax) +; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X32-AVX-NEXT: retl ; ; X32-AVX512F-LABEL: test_broadcast_4i32_8i32_chain: ; X32-AVX512F: # %bb.0: ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-AVX512F-NEXT: vmovaps (%ecx), %xmm0 ; X32-AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X32-AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] ; X32-AVX512F-NEXT: vmovdqa %xmm1, (%eax) +; X32-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X32-AVX512F-NEXT: retl ; ; X32-AVX512BW-LABEL: test_broadcast_4i32_8i32_chain: ; X32-AVX512BW: # %bb.0: ; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-AVX512BW-NEXT: vmovaps (%ecx), %xmm0 ; X32-AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X32-AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] ; X32-AVX512BW-NEXT: vmovdqa %xmm1, (%eax) +; X32-AVX512BW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X32-AVX512BW-NEXT: retl ; ; X32-AVX512DQ-LABEL: test_broadcast_4i32_8i32_chain: ; X32-AVX512DQ: # %bb.0: ; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-AVX512DQ-NEXT: vmovaps 
(%ecx), %xmm0 ; X32-AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X32-AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] ; X32-AVX512DQ-NEXT: vmovaps %xmm1, (%eax) +; X32-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X32-AVX512DQ-NEXT: retl ; ; X64-AVX-LABEL: test_broadcast_4i32_8i32_chain: ; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vmovaps (%rdi), %xmm0 ; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] ; X64-AVX-NEXT: vmovaps %xmm1, (%rsi) +; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X64-AVX-NEXT: retq ; ; X64-AVX512F-LABEL: test_broadcast_4i32_8i32_chain: ; X64-AVX512F: # %bb.0: +; X64-AVX512F-NEXT: vmovaps (%rdi), %xmm0 ; X64-AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X64-AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] ; X64-AVX512F-NEXT: vmovdqa %xmm1, (%rsi) +; X64-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X64-AVX512F-NEXT: retq ; ; X64-AVX512BW-LABEL: test_broadcast_4i32_8i32_chain: ; X64-AVX512BW: # %bb.0: +; X64-AVX512BW-NEXT: vmovaps (%rdi), %xmm0 ; X64-AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X64-AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] ; X64-AVX512BW-NEXT: vmovdqa %xmm1, (%rsi) +; X64-AVX512BW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X64-AVX512BW-NEXT: retq ; ; X64-AVX512DQ-LABEL: test_broadcast_4i32_8i32_chain: ; X64-AVX512DQ: # %bb.0: +; X64-AVX512DQ-NEXT: vmovaps (%rdi), %xmm0 ; X64-AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X64-AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] ; X64-AVX512DQ-NEXT: vmovaps %xmm1, (%rsi) +; X64-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X64-AVX512DQ-NEXT: retq %1 = load <4 x i32>, <4 x i32>* %p0 store <4 x float> zeroinitializer, <4 x float>* %p1 @@ -821,9 +829,10 @@ define <16 x i32> @test_broadcast_4i32_16i32_chain(<4 x i32>* %p0, <4 x float>* ; X32-AVX: # %bb.0: ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-AVX-NEXT: vmovaps (%ecx), 
%xmm0 ; X32-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] ; X32-AVX-NEXT: vmovaps %xmm1, (%eax) +; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1 ; X32-AVX-NEXT: retl ; @@ -831,56 +840,63 @@ define <16 x i32> @test_broadcast_4i32_16i32_chain(<4 x i32>* %p0, <4 x float>* ; X32-AVX512F: # %bb.0: ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-AVX512F-NEXT: vmovdqa (%ecx), %xmm0 ; X32-AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X32-AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X32-AVX512F-NEXT: vmovdqa %xmm1, (%eax) +; X32-AVX512F-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X32-AVX512F-NEXT: retl ; ; X32-AVX512BW-LABEL: test_broadcast_4i32_16i32_chain: ; X32-AVX512BW: # %bb.0: ; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-AVX512BW-NEXT: vmovdqa (%ecx), %xmm0 ; X32-AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X32-AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X32-AVX512BW-NEXT: vmovdqa %xmm1, (%eax) +; X32-AVX512BW-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X32-AVX512BW-NEXT: retl ; ; X32-AVX512DQ-LABEL: test_broadcast_4i32_16i32_chain: ; X32-AVX512DQ: # %bb.0: ; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-AVX512DQ-NEXT: vmovdqa (%ecx), %xmm0 ; X32-AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X32-AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X32-AVX512DQ-NEXT: vmovaps %xmm1, (%eax) +; X32-AVX512DQ-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X32-AVX512DQ-NEXT: retl ; ; X64-AVX-LABEL: test_broadcast_4i32_16i32_chain: ; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vmovaps (%rdi), %xmm0 ; X64-AVX-NEXT: vxorps %xmm1, %xmm1, 
%xmm1 -; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] ; X64-AVX-NEXT: vmovaps %xmm1, (%rsi) +; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 ; X64-AVX-NEXT: retq ; ; X64-AVX512F-LABEL: test_broadcast_4i32_16i32_chain: ; X64-AVX512F: # %bb.0: +; X64-AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; X64-AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X64-AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X64-AVX512F-NEXT: vmovdqa %xmm1, (%rsi) +; X64-AVX512F-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X64-AVX512F-NEXT: retq ; ; X64-AVX512BW-LABEL: test_broadcast_4i32_16i32_chain: ; X64-AVX512BW: # %bb.0: +; X64-AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; X64-AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X64-AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X64-AVX512BW-NEXT: vmovdqa %xmm1, (%rsi) +; X64-AVX512BW-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X64-AVX512BW-NEXT: retq ; ; X64-AVX512DQ-LABEL: test_broadcast_4i32_16i32_chain: ; X64-AVX512DQ: # %bb.0: +; X64-AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; X64-AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X64-AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X64-AVX512DQ-NEXT: vmovaps %xmm1, (%rsi) +; X64-AVX512DQ-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X64-AVX512DQ-NEXT: retq %1 = load <4 x i32>, <4 x i32>* %p0 store <4 x float> zeroinitializer, <4 x float>* %p1 diff --git a/llvm/test/CodeGen/X86/var-permute-256.ll b/llvm/test/CodeGen/X86/var-permute-256.ll index d74f925939f..6b63e11ca72 100644 --- a/llvm/test/CodeGen/X86/var-permute-256.ll +++ b/llvm/test/CodeGen/X86/var-permute-256.ll @@ -685,49 +685,64 @@ define <32 x i8> @var_shuffle_v32i8(<32 x i8> %v, <32 x i8> %indices) nounwind { ; XOP-NEXT: vmovd %eax, %xmm0 ; XOP-NEXT: vpextrb $1, %xmm2, %eax ; XOP-NEXT: andl $31, %eax -; XOP-NEXT: 
vpinsrb $1, (%rsp,%rax), %xmm0, %xmm0 +; XOP-NEXT: movzbl (%rsp,%rax), %eax +; XOP-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; XOP-NEXT: vpextrb $2, %xmm2, %eax ; XOP-NEXT: andl $31, %eax -; XOP-NEXT: vpinsrb $2, (%rsp,%rax), %xmm0, %xmm0 +; XOP-NEXT: movzbl (%rsp,%rax), %eax +; XOP-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; XOP-NEXT: vpextrb $3, %xmm2, %eax ; XOP-NEXT: andl $31, %eax -; XOP-NEXT: vpinsrb $3, (%rsp,%rax), %xmm0, %xmm0 +; XOP-NEXT: movzbl (%rsp,%rax), %eax +; XOP-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; XOP-NEXT: vpextrb $4, %xmm2, %eax ; XOP-NEXT: andl $31, %eax -; XOP-NEXT: vpinsrb $4, (%rsp,%rax), %xmm0, %xmm0 +; XOP-NEXT: movzbl (%rsp,%rax), %eax +; XOP-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; XOP-NEXT: vpextrb $5, %xmm2, %eax ; XOP-NEXT: andl $31, %eax -; XOP-NEXT: vpinsrb $5, (%rsp,%rax), %xmm0, %xmm0 +; XOP-NEXT: movzbl (%rsp,%rax), %eax +; XOP-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; XOP-NEXT: vpextrb $6, %xmm2, %eax ; XOP-NEXT: andl $31, %eax -; XOP-NEXT: vpinsrb $6, (%rsp,%rax), %xmm0, %xmm0 +; XOP-NEXT: movzbl (%rsp,%rax), %eax +; XOP-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; XOP-NEXT: vpextrb $7, %xmm2, %eax ; XOP-NEXT: andl $31, %eax -; XOP-NEXT: vpinsrb $7, (%rsp,%rax), %xmm0, %xmm0 +; XOP-NEXT: movzbl (%rsp,%rax), %eax +; XOP-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; XOP-NEXT: vpextrb $8, %xmm2, %eax ; XOP-NEXT: andl $31, %eax -; XOP-NEXT: vpinsrb $8, (%rsp,%rax), %xmm0, %xmm0 +; XOP-NEXT: movzbl (%rsp,%rax), %eax +; XOP-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ; XOP-NEXT: vpextrb $9, %xmm2, %eax ; XOP-NEXT: andl $31, %eax -; XOP-NEXT: vpinsrb $9, (%rsp,%rax), %xmm0, %xmm0 +; XOP-NEXT: movzbl (%rsp,%rax), %eax +; XOP-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; XOP-NEXT: vpextrb $10, %xmm2, %eax ; XOP-NEXT: andl $31, %eax -; XOP-NEXT: vpinsrb $10, (%rsp,%rax), %xmm0, %xmm0 +; XOP-NEXT: movzbl (%rsp,%rax), %eax +; XOP-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ; XOP-NEXT: vpextrb $11, %xmm2, %eax ; XOP-NEXT: andl $31, %eax -; XOP-NEXT: vpinsrb $11, (%rsp,%rax), %xmm0, 
%xmm0 +; XOP-NEXT: movzbl (%rsp,%rax), %eax +; XOP-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; XOP-NEXT: vpextrb $12, %xmm2, %eax ; XOP-NEXT: andl $31, %eax -; XOP-NEXT: vpinsrb $12, (%rsp,%rax), %xmm0, %xmm0 +; XOP-NEXT: movzbl (%rsp,%rax), %eax +; XOP-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ; XOP-NEXT: vpextrb $13, %xmm2, %eax ; XOP-NEXT: andl $31, %eax -; XOP-NEXT: vpinsrb $13, (%rsp,%rax), %xmm0, %xmm0 +; XOP-NEXT: movzbl (%rsp,%rax), %eax +; XOP-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; XOP-NEXT: vpextrb $14, %xmm2, %eax ; XOP-NEXT: andl $31, %eax -; XOP-NEXT: vpinsrb $14, (%rsp,%rax), %xmm0, %xmm0 +; XOP-NEXT: movzbl (%rsp,%rax), %eax +; XOP-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ; XOP-NEXT: vpextrb $15, %xmm2, %eax ; XOP-NEXT: andl $31, %eax -; XOP-NEXT: vpinsrb $15, (%rsp,%rax), %xmm0, %xmm0 +; XOP-NEXT: movzbl (%rsp,%rax), %eax +; XOP-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; XOP-NEXT: vpextrb $0, %xmm1, %eax ; XOP-NEXT: andl $31, %eax ; XOP-NEXT: movzbl (%rsp,%rax), %eax @@ -797,49 +812,64 @@ define <32 x i8> @var_shuffle_v32i8(<32 x i8> %v, <32 x i8> %indices) nounwind { ; AVX1-NEXT: vmovd %eax, %xmm0 ; AVX1-NEXT: vpextrb $1, %xmm2, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: vpinsrb $1, (%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl (%rsp,%rax), %eax +; AVX1-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $2, %xmm2, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: vpinsrb $2, (%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl (%rsp,%rax), %eax +; AVX1-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $3, %xmm2, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: vpinsrb $3, (%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl (%rsp,%rax), %eax +; AVX1-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $4, %xmm2, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: vpinsrb $4, (%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl (%rsp,%rax), %eax +; AVX1-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $5, %xmm2, %eax ; AVX1-NEXT: andl $31, %eax -; 
AVX1-NEXT: vpinsrb $5, (%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl (%rsp,%rax), %eax +; AVX1-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $6, %xmm2, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: vpinsrb $6, (%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl (%rsp,%rax), %eax +; AVX1-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $7, %xmm2, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: vpinsrb $7, (%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl (%rsp,%rax), %eax +; AVX1-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $8, %xmm2, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: vpinsrb $8, (%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl (%rsp,%rax), %eax +; AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $9, %xmm2, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: vpinsrb $9, (%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl (%rsp,%rax), %eax +; AVX1-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $10, %xmm2, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: vpinsrb $10, (%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl (%rsp,%rax), %eax +; AVX1-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $11, %xmm2, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: vpinsrb $11, (%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl (%rsp,%rax), %eax +; AVX1-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $12, %xmm2, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: vpinsrb $12, (%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl (%rsp,%rax), %eax +; AVX1-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $13, %xmm2, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: vpinsrb $13, (%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl (%rsp,%rax), %eax +; AVX1-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $14, %xmm2, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: vpinsrb $14, (%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl (%rsp,%rax), %eax +; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $15, %xmm2, %eax ; 
AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: vpinsrb $15, (%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl (%rsp,%rax), %eax +; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $0, %xmm1, %eax ; AVX1-NEXT: andl $31, %eax ; AVX1-NEXT: movzbl (%rsp,%rax), %eax @@ -909,49 +939,64 @@ define <32 x i8> @var_shuffle_v32i8(<32 x i8> %v, <32 x i8> %indices) nounwind { ; AVX2-NEXT: vmovd %eax, %xmm0 ; AVX2-NEXT: vpextrb $1, %xmm2, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: vpinsrb $1, (%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl (%rsp,%rax), %eax +; AVX2-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $2, %xmm2, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: vpinsrb $2, (%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl (%rsp,%rax), %eax +; AVX2-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $3, %xmm2, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: vpinsrb $3, (%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl (%rsp,%rax), %eax +; AVX2-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $4, %xmm2, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: vpinsrb $4, (%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl (%rsp,%rax), %eax +; AVX2-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $5, %xmm2, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: vpinsrb $5, (%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl (%rsp,%rax), %eax +; AVX2-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $6, %xmm2, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: vpinsrb $6, (%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl (%rsp,%rax), %eax +; AVX2-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $7, %xmm2, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: vpinsrb $7, (%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl (%rsp,%rax), %eax +; AVX2-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $8, %xmm2, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: vpinsrb $8, (%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl (%rsp,%rax), %eax +; AVX2-NEXT: vpinsrb $8, %eax, 
%xmm0, %xmm0 ; AVX2-NEXT: vpextrb $9, %xmm2, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: vpinsrb $9, (%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl (%rsp,%rax), %eax +; AVX2-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $10, %xmm2, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: vpinsrb $10, (%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl (%rsp,%rax), %eax +; AVX2-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $11, %xmm2, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: vpinsrb $11, (%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl (%rsp,%rax), %eax +; AVX2-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $12, %xmm2, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: vpinsrb $12, (%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl (%rsp,%rax), %eax +; AVX2-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $13, %xmm2, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: vpinsrb $13, (%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl (%rsp,%rax), %eax +; AVX2-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $14, %xmm2, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: vpinsrb $14, (%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl (%rsp,%rax), %eax +; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $15, %xmm2, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: vpinsrb $15, (%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl (%rsp,%rax), %eax +; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $0, %xmm1, %eax ; AVX2-NEXT: andl $31, %eax ; AVX2-NEXT: movzbl (%rsp,%rax), %eax @@ -1021,49 +1066,64 @@ define <32 x i8> @var_shuffle_v32i8(<32 x i8> %v, <32 x i8> %indices) nounwind { ; AVX512F-NEXT: vmovd %eax, %xmm0 ; AVX512F-NEXT: vpextrb $1, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrb $1, (%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $2, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrb $2, 
(%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $3, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrb $3, (%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $4, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrb $4, (%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $5, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrb $5, (%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $6, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrb $6, (%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $7, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrb $7, (%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $8, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrb $8, (%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $9, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrb $9, (%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $10, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrb $10, (%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $11, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrb $11, (%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: movzbl 
(%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $12, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrb $12, (%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $13, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrb $13, (%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $14, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrb $14, (%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $15, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrb $15, (%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $0, %xmm1, %eax ; AVX512F-NEXT: andl $31, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax @@ -1133,49 +1193,64 @@ define <32 x i8> @var_shuffle_v32i8(<32 x i8> %v, <32 x i8> %indices) nounwind { ; AVX512DQ-NEXT: vmovd %eax, %xmm0 ; AVX512DQ-NEXT: vpextrb $1, %xmm2, %eax ; AVX512DQ-NEXT: andl $31, %eax -; AVX512DQ-NEXT: vpinsrb $1, (%rsp,%rax), %xmm0, %xmm0 +; AVX512DQ-NEXT: movzbl (%rsp,%rax), %eax +; AVX512DQ-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpextrb $2, %xmm2, %eax ; AVX512DQ-NEXT: andl $31, %eax -; AVX512DQ-NEXT: vpinsrb $2, (%rsp,%rax), %xmm0, %xmm0 +; AVX512DQ-NEXT: movzbl (%rsp,%rax), %eax +; AVX512DQ-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpextrb $3, %xmm2, %eax ; AVX512DQ-NEXT: andl $31, %eax -; AVX512DQ-NEXT: vpinsrb $3, (%rsp,%rax), %xmm0, %xmm0 +; AVX512DQ-NEXT: movzbl (%rsp,%rax), %eax +; AVX512DQ-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpextrb $4, %xmm2, %eax ; AVX512DQ-NEXT: andl $31, %eax -; AVX512DQ-NEXT: vpinsrb $4, (%rsp,%rax), %xmm0, %xmm0 +; 
AVX512DQ-NEXT: movzbl (%rsp,%rax), %eax +; AVX512DQ-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpextrb $5, %xmm2, %eax ; AVX512DQ-NEXT: andl $31, %eax -; AVX512DQ-NEXT: vpinsrb $5, (%rsp,%rax), %xmm0, %xmm0 +; AVX512DQ-NEXT: movzbl (%rsp,%rax), %eax +; AVX512DQ-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpextrb $6, %xmm2, %eax ; AVX512DQ-NEXT: andl $31, %eax -; AVX512DQ-NEXT: vpinsrb $6, (%rsp,%rax), %xmm0, %xmm0 +; AVX512DQ-NEXT: movzbl (%rsp,%rax), %eax +; AVX512DQ-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpextrb $7, %xmm2, %eax ; AVX512DQ-NEXT: andl $31, %eax -; AVX512DQ-NEXT: vpinsrb $7, (%rsp,%rax), %xmm0, %xmm0 +; AVX512DQ-NEXT: movzbl (%rsp,%rax), %eax +; AVX512DQ-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpextrb $8, %xmm2, %eax ; AVX512DQ-NEXT: andl $31, %eax -; AVX512DQ-NEXT: vpinsrb $8, (%rsp,%rax), %xmm0, %xmm0 +; AVX512DQ-NEXT: movzbl (%rsp,%rax), %eax +; AVX512DQ-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpextrb $9, %xmm2, %eax ; AVX512DQ-NEXT: andl $31, %eax -; AVX512DQ-NEXT: vpinsrb $9, (%rsp,%rax), %xmm0, %xmm0 +; AVX512DQ-NEXT: movzbl (%rsp,%rax), %eax +; AVX512DQ-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpextrb $10, %xmm2, %eax ; AVX512DQ-NEXT: andl $31, %eax -; AVX512DQ-NEXT: vpinsrb $10, (%rsp,%rax), %xmm0, %xmm0 +; AVX512DQ-NEXT: movzbl (%rsp,%rax), %eax +; AVX512DQ-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpextrb $11, %xmm2, %eax ; AVX512DQ-NEXT: andl $31, %eax -; AVX512DQ-NEXT: vpinsrb $11, (%rsp,%rax), %xmm0, %xmm0 +; AVX512DQ-NEXT: movzbl (%rsp,%rax), %eax +; AVX512DQ-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpextrb $12, %xmm2, %eax ; AVX512DQ-NEXT: andl $31, %eax -; AVX512DQ-NEXT: vpinsrb $12, (%rsp,%rax), %xmm0, %xmm0 +; AVX512DQ-NEXT: movzbl (%rsp,%rax), %eax +; AVX512DQ-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpextrb $13, %xmm2, %eax ; AVX512DQ-NEXT: andl $31, %eax -; AVX512DQ-NEXT: vpinsrb $13, (%rsp,%rax), %xmm0, %xmm0 +; 
AVX512DQ-NEXT: movzbl (%rsp,%rax), %eax +; AVX512DQ-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpextrb $14, %xmm2, %eax ; AVX512DQ-NEXT: andl $31, %eax -; AVX512DQ-NEXT: vpinsrb $14, (%rsp,%rax), %xmm0, %xmm0 +; AVX512DQ-NEXT: movzbl (%rsp,%rax), %eax +; AVX512DQ-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpextrb $15, %xmm2, %eax ; AVX512DQ-NEXT: andl $31, %eax -; AVX512DQ-NEXT: vpinsrb $15, (%rsp,%rax), %xmm0, %xmm0 +; AVX512DQ-NEXT: movzbl (%rsp,%rax), %eax +; AVX512DQ-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpextrb $0, %xmm1, %eax ; AVX512DQ-NEXT: andl $31, %eax ; AVX512DQ-NEXT: movzbl (%rsp,%rax), %eax @@ -1245,49 +1320,64 @@ define <32 x i8> @var_shuffle_v32i8(<32 x i8> %v, <32 x i8> %indices) nounwind { ; AVX512VL-NEXT: vmovd %eax, %xmm0 ; AVX512VL-NEXT: vpextrb $1, %xmm2, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: vpinsrb $1, (%rsp,%rax), %xmm0, %xmm0 +; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax +; AVX512VL-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $2, %xmm2, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: vpinsrb $2, (%rsp,%rax), %xmm0, %xmm0 +; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax +; AVX512VL-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $3, %xmm2, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: vpinsrb $3, (%rsp,%rax), %xmm0, %xmm0 +; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax +; AVX512VL-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $4, %xmm2, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: vpinsrb $4, (%rsp,%rax), %xmm0, %xmm0 +; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax +; AVX512VL-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $5, %xmm2, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: vpinsrb $5, (%rsp,%rax), %xmm0, %xmm0 +; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax +; AVX512VL-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $6, %xmm2, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: vpinsrb 
$6, (%rsp,%rax), %xmm0, %xmm0 +; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax +; AVX512VL-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $7, %xmm2, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: vpinsrb $7, (%rsp,%rax), %xmm0, %xmm0 +; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax +; AVX512VL-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $8, %xmm2, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: vpinsrb $8, (%rsp,%rax), %xmm0, %xmm0 +; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax +; AVX512VL-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $9, %xmm2, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: vpinsrb $9, (%rsp,%rax), %xmm0, %xmm0 +; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax +; AVX512VL-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $10, %xmm2, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: vpinsrb $10, (%rsp,%rax), %xmm0, %xmm0 +; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax +; AVX512VL-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $11, %xmm2, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: vpinsrb $11, (%rsp,%rax), %xmm0, %xmm0 +; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax +; AVX512VL-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $12, %xmm2, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: vpinsrb $12, (%rsp,%rax), %xmm0, %xmm0 +; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax +; AVX512VL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $13, %xmm2, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: vpinsrb $13, (%rsp,%rax), %xmm0, %xmm0 +; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax +; AVX512VL-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $14, %xmm2, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: vpinsrb $14, (%rsp,%rax), %xmm0, %xmm0 +; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax +; AVX512VL-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $15, %xmm2, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: 
vpinsrb $15, (%rsp,%rax), %xmm0, %xmm0 +; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax +; AVX512VL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $0, %xmm1, %eax ; AVX512VL-NEXT: andl $31, %eax ; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax @@ -2293,49 +2383,64 @@ define <32 x i8> @var_shuffle_v32i8_from_v16i8(<16 x i8> %v, <32 x i8> %indices) ; XOP-NEXT: vmovd %eax, %xmm0 ; XOP-NEXT: vpextrb $1, %xmm2, %eax ; XOP-NEXT: andl $15, %eax -; XOP-NEXT: vpinsrb $1, -24(%rsp,%rax), %xmm0, %xmm0 +; XOP-NEXT: movzbl -24(%rsp,%rax), %eax +; XOP-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; XOP-NEXT: vpextrb $2, %xmm2, %eax ; XOP-NEXT: andl $15, %eax -; XOP-NEXT: vpinsrb $2, -24(%rsp,%rax), %xmm0, %xmm0 +; XOP-NEXT: movzbl -24(%rsp,%rax), %eax +; XOP-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; XOP-NEXT: vpextrb $3, %xmm2, %eax ; XOP-NEXT: andl $15, %eax -; XOP-NEXT: vpinsrb $3, -24(%rsp,%rax), %xmm0, %xmm0 +; XOP-NEXT: movzbl -24(%rsp,%rax), %eax +; XOP-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; XOP-NEXT: vpextrb $4, %xmm2, %eax ; XOP-NEXT: andl $15, %eax -; XOP-NEXT: vpinsrb $4, -24(%rsp,%rax), %xmm0, %xmm0 +; XOP-NEXT: movzbl -24(%rsp,%rax), %eax +; XOP-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; XOP-NEXT: vpextrb $5, %xmm2, %eax ; XOP-NEXT: andl $15, %eax -; XOP-NEXT: vpinsrb $5, -24(%rsp,%rax), %xmm0, %xmm0 +; XOP-NEXT: movzbl -24(%rsp,%rax), %eax +; XOP-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; XOP-NEXT: vpextrb $6, %xmm2, %eax ; XOP-NEXT: andl $15, %eax -; XOP-NEXT: vpinsrb $6, -24(%rsp,%rax), %xmm0, %xmm0 +; XOP-NEXT: movzbl -24(%rsp,%rax), %eax +; XOP-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; XOP-NEXT: vpextrb $7, %xmm2, %eax ; XOP-NEXT: andl $15, %eax -; XOP-NEXT: vpinsrb $7, -24(%rsp,%rax), %xmm0, %xmm0 +; XOP-NEXT: movzbl -24(%rsp,%rax), %eax +; XOP-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; XOP-NEXT: vpextrb $8, %xmm2, %eax ; XOP-NEXT: andl $15, %eax -; XOP-NEXT: vpinsrb $8, -24(%rsp,%rax), %xmm0, %xmm0 +; XOP-NEXT: movzbl -24(%rsp,%rax), %eax +; XOP-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ; 
XOP-NEXT: vpextrb $9, %xmm2, %eax ; XOP-NEXT: andl $15, %eax -; XOP-NEXT: vpinsrb $9, -24(%rsp,%rax), %xmm0, %xmm0 +; XOP-NEXT: movzbl -24(%rsp,%rax), %eax +; XOP-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; XOP-NEXT: vpextrb $10, %xmm2, %eax ; XOP-NEXT: andl $15, %eax -; XOP-NEXT: vpinsrb $10, -24(%rsp,%rax), %xmm0, %xmm0 +; XOP-NEXT: movzbl -24(%rsp,%rax), %eax +; XOP-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ; XOP-NEXT: vpextrb $11, %xmm2, %eax ; XOP-NEXT: andl $15, %eax -; XOP-NEXT: vpinsrb $11, -24(%rsp,%rax), %xmm0, %xmm0 +; XOP-NEXT: movzbl -24(%rsp,%rax), %eax +; XOP-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; XOP-NEXT: vpextrb $12, %xmm2, %eax ; XOP-NEXT: andl $15, %eax -; XOP-NEXT: vpinsrb $12, -24(%rsp,%rax), %xmm0, %xmm0 +; XOP-NEXT: movzbl -24(%rsp,%rax), %eax +; XOP-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ; XOP-NEXT: vpextrb $13, %xmm2, %eax ; XOP-NEXT: andl $15, %eax -; XOP-NEXT: vpinsrb $13, -24(%rsp,%rax), %xmm0, %xmm0 +; XOP-NEXT: movzbl -24(%rsp,%rax), %eax +; XOP-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; XOP-NEXT: vpextrb $14, %xmm2, %eax ; XOP-NEXT: andl $15, %eax -; XOP-NEXT: vpinsrb $14, -24(%rsp,%rax), %xmm0, %xmm0 +; XOP-NEXT: movzbl -24(%rsp,%rax), %eax +; XOP-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ; XOP-NEXT: vpextrb $15, %xmm2, %eax ; XOP-NEXT: andl $15, %eax -; XOP-NEXT: vpinsrb $15, -24(%rsp,%rax), %xmm0, %xmm0 +; XOP-NEXT: movzbl -24(%rsp,%rax), %eax +; XOP-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; XOP-NEXT: vpextrb $0, %xmm1, %eax ; XOP-NEXT: andl $15, %eax ; XOP-NEXT: movzbl -24(%rsp,%rax), %eax @@ -2399,49 +2504,64 @@ define <32 x i8> @var_shuffle_v32i8_from_v16i8(<16 x i8> %v, <32 x i8> %indices) ; AVX1-NEXT: vmovd %eax, %xmm0 ; AVX1-NEXT: vpextrb $1, %xmm2, %eax ; AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: vpinsrb $1, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX1-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $2, %xmm2, %eax ; AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: vpinsrb $2, -24(%rsp,%rax), %xmm0, %xmm0 +; 
AVX1-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX1-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $3, %xmm2, %eax ; AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: vpinsrb $3, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX1-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $4, %xmm2, %eax ; AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: vpinsrb $4, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX1-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $5, %xmm2, %eax ; AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: vpinsrb $5, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX1-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $6, %xmm2, %eax ; AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: vpinsrb $6, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX1-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $7, %xmm2, %eax ; AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: vpinsrb $7, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX1-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $8, %xmm2, %eax ; AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: vpinsrb $8, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $9, %xmm2, %eax ; AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: vpinsrb $9, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX1-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $10, %xmm2, %eax ; AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: vpinsrb $10, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX1-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $11, %xmm2, %eax ; AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: vpinsrb $11, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX1-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $12, %xmm2, %eax ; 
AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: vpinsrb $12, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX1-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $13, %xmm2, %eax ; AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: vpinsrb $13, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX1-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $14, %xmm2, %eax ; AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: vpinsrb $14, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $15, %xmm2, %eax ; AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: vpinsrb $15, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $0, %xmm1, %eax ; AVX1-NEXT: andl $15, %eax ; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax @@ -2505,49 +2625,64 @@ define <32 x i8> @var_shuffle_v32i8_from_v16i8(<16 x i8> %v, <32 x i8> %indices) ; AVX2-NEXT: vmovd %eax, %xmm0 ; AVX2-NEXT: vpextrb $1, %xmm2, %eax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpinsrb $1, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX2-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $2, %xmm2, %eax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpinsrb $2, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX2-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $3, %xmm2, %eax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpinsrb $3, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX2-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $4, %xmm2, %eax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpinsrb $4, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX2-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $5, %xmm2, %eax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpinsrb $5, -24(%rsp,%rax), %xmm0, %xmm0 +; 
AVX2-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX2-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $6, %xmm2, %eax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpinsrb $6, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX2-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $7, %xmm2, %eax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpinsrb $7, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX2-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $8, %xmm2, %eax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpinsrb $8, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $9, %xmm2, %eax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpinsrb $9, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX2-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $10, %xmm2, %eax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpinsrb $10, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX2-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $11, %xmm2, %eax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpinsrb $11, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX2-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $12, %xmm2, %eax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpinsrb $12, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX2-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $13, %xmm2, %eax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpinsrb $13, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX2-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $14, %xmm2, %eax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpinsrb $14, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $15, %xmm2, %eax 
; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpinsrb $15, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $0, %xmm1, %eax ; AVX2-NEXT: andl $15, %eax ; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax @@ -2611,49 +2746,64 @@ define <32 x i8> @var_shuffle_v32i8_from_v16i8(<16 x i8> %v, <32 x i8> %indices) ; AVX512F-NEXT: vmovd %eax, %xmm0 ; AVX512F-NEXT: vpextrb $1, %xmm2, %eax ; AVX512F-NEXT: andl $15, %eax -; AVX512F-NEXT: vpinsrb $1, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $2, %xmm2, %eax ; AVX512F-NEXT: andl $15, %eax -; AVX512F-NEXT: vpinsrb $2, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $3, %xmm2, %eax ; AVX512F-NEXT: andl $15, %eax -; AVX512F-NEXT: vpinsrb $3, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $4, %xmm2, %eax ; AVX512F-NEXT: andl $15, %eax -; AVX512F-NEXT: vpinsrb $4, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $5, %xmm2, %eax ; AVX512F-NEXT: andl $15, %eax -; AVX512F-NEXT: vpinsrb $5, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $6, %xmm2, %eax ; AVX512F-NEXT: andl $15, %eax -; AVX512F-NEXT: vpinsrb $6, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $7, %xmm2, %eax ; AVX512F-NEXT: andl $15, %eax -; AVX512F-NEXT: vpinsrb $7, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $8, 
%xmm2, %eax ; AVX512F-NEXT: andl $15, %eax -; AVX512F-NEXT: vpinsrb $8, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $9, %xmm2, %eax ; AVX512F-NEXT: andl $15, %eax -; AVX512F-NEXT: vpinsrb $9, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $10, %xmm2, %eax ; AVX512F-NEXT: andl $15, %eax -; AVX512F-NEXT: vpinsrb $10, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $11, %xmm2, %eax ; AVX512F-NEXT: andl $15, %eax -; AVX512F-NEXT: vpinsrb $11, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $12, %xmm2, %eax ; AVX512F-NEXT: andl $15, %eax -; AVX512F-NEXT: vpinsrb $12, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $13, %xmm2, %eax ; AVX512F-NEXT: andl $15, %eax -; AVX512F-NEXT: vpinsrb $13, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $14, %xmm2, %eax ; AVX512F-NEXT: andl $15, %eax -; AVX512F-NEXT: vpinsrb $14, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $15, %xmm2, %eax ; AVX512F-NEXT: andl $15, %eax -; AVX512F-NEXT: vpinsrb $15, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX512F-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $0, %xmm1, %eax ; AVX512F-NEXT: andl $15, %eax ; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax @@ -2717,49 +2867,64 @@ define <32 x i8> @var_shuffle_v32i8_from_v16i8(<16 x i8> %v, <32 x i8> %indices) ; 
AVX512DQ-NEXT: vmovd %eax, %xmm0 ; AVX512DQ-NEXT: vpextrb $1, %xmm2, %eax ; AVX512DQ-NEXT: andl $15, %eax -; AVX512DQ-NEXT: vpinsrb $1, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX512DQ-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX512DQ-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpextrb $2, %xmm2, %eax ; AVX512DQ-NEXT: andl $15, %eax -; AVX512DQ-NEXT: vpinsrb $2, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX512DQ-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX512DQ-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpextrb $3, %xmm2, %eax ; AVX512DQ-NEXT: andl $15, %eax -; AVX512DQ-NEXT: vpinsrb $3, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX512DQ-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX512DQ-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpextrb $4, %xmm2, %eax ; AVX512DQ-NEXT: andl $15, %eax -; AVX512DQ-NEXT: vpinsrb $4, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX512DQ-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX512DQ-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpextrb $5, %xmm2, %eax ; AVX512DQ-NEXT: andl $15, %eax -; AVX512DQ-NEXT: vpinsrb $5, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX512DQ-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX512DQ-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpextrb $6, %xmm2, %eax ; AVX512DQ-NEXT: andl $15, %eax -; AVX512DQ-NEXT: vpinsrb $6, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX512DQ-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX512DQ-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpextrb $7, %xmm2, %eax ; AVX512DQ-NEXT: andl $15, %eax -; AVX512DQ-NEXT: vpinsrb $7, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX512DQ-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX512DQ-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpextrb $8, %xmm2, %eax ; AVX512DQ-NEXT: andl $15, %eax -; AVX512DQ-NEXT: vpinsrb $8, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX512DQ-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX512DQ-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpextrb $9, %xmm2, %eax ; AVX512DQ-NEXT: andl $15, %eax -; AVX512DQ-NEXT: vpinsrb $9, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX512DQ-NEXT: 
movzbl -24(%rsp,%rax), %eax +; AVX512DQ-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpextrb $10, %xmm2, %eax ; AVX512DQ-NEXT: andl $15, %eax -; AVX512DQ-NEXT: vpinsrb $10, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX512DQ-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX512DQ-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpextrb $11, %xmm2, %eax ; AVX512DQ-NEXT: andl $15, %eax -; AVX512DQ-NEXT: vpinsrb $11, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX512DQ-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX512DQ-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpextrb $12, %xmm2, %eax ; AVX512DQ-NEXT: andl $15, %eax -; AVX512DQ-NEXT: vpinsrb $12, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX512DQ-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX512DQ-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpextrb $13, %xmm2, %eax ; AVX512DQ-NEXT: andl $15, %eax -; AVX512DQ-NEXT: vpinsrb $13, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX512DQ-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX512DQ-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpextrb $14, %xmm2, %eax ; AVX512DQ-NEXT: andl $15, %eax -; AVX512DQ-NEXT: vpinsrb $14, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX512DQ-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX512DQ-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpextrb $15, %xmm2, %eax ; AVX512DQ-NEXT: andl $15, %eax -; AVX512DQ-NEXT: vpinsrb $15, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX512DQ-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX512DQ-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpextrb $0, %xmm1, %eax ; AVX512DQ-NEXT: andl $15, %eax ; AVX512DQ-NEXT: movzbl -24(%rsp,%rax), %eax @@ -2823,49 +2988,64 @@ define <32 x i8> @var_shuffle_v32i8_from_v16i8(<16 x i8> %v, <32 x i8> %indices) ; AVX512VL-NEXT: vmovd %eax, %xmm0 ; AVX512VL-NEXT: vpextrb $1, %xmm2, %eax ; AVX512VL-NEXT: andl $15, %eax -; AVX512VL-NEXT: vpinsrb $1, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX512VL-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $2, %xmm2, %eax ; AVX512VL-NEXT: 
andl $15, %eax -; AVX512VL-NEXT: vpinsrb $2, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX512VL-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $3, %xmm2, %eax ; AVX512VL-NEXT: andl $15, %eax -; AVX512VL-NEXT: vpinsrb $3, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX512VL-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $4, %xmm2, %eax ; AVX512VL-NEXT: andl $15, %eax -; AVX512VL-NEXT: vpinsrb $4, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX512VL-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $5, %xmm2, %eax ; AVX512VL-NEXT: andl $15, %eax -; AVX512VL-NEXT: vpinsrb $5, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX512VL-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $6, %xmm2, %eax ; AVX512VL-NEXT: andl $15, %eax -; AVX512VL-NEXT: vpinsrb $6, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX512VL-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $7, %xmm2, %eax ; AVX512VL-NEXT: andl $15, %eax -; AVX512VL-NEXT: vpinsrb $7, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX512VL-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $8, %xmm2, %eax ; AVX512VL-NEXT: andl $15, %eax -; AVX512VL-NEXT: vpinsrb $8, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX512VL-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $9, %xmm2, %eax ; AVX512VL-NEXT: andl $15, %eax -; AVX512VL-NEXT: vpinsrb $9, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX512VL-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $10, %xmm2, %eax ; AVX512VL-NEXT: andl $15, %eax -; AVX512VL-NEXT: vpinsrb $10, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX512VL-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ; 
AVX512VL-NEXT: vpextrb $11, %xmm2, %eax ; AVX512VL-NEXT: andl $15, %eax -; AVX512VL-NEXT: vpinsrb $11, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX512VL-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $12, %xmm2, %eax ; AVX512VL-NEXT: andl $15, %eax -; AVX512VL-NEXT: vpinsrb $12, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX512VL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $13, %xmm2, %eax ; AVX512VL-NEXT: andl $15, %eax -; AVX512VL-NEXT: vpinsrb $13, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX512VL-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $14, %xmm2, %eax ; AVX512VL-NEXT: andl $15, %eax -; AVX512VL-NEXT: vpinsrb $14, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX512VL-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $15, %xmm2, %eax ; AVX512VL-NEXT: andl $15, %eax -; AVX512VL-NEXT: vpinsrb $15, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax +; AVX512VL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $0, %xmm1, %eax ; AVX512VL-NEXT: andl $15, %eax ; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax diff --git a/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll b/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll index b4950ee49fc..c726a149175 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll @@ -47,7 +47,8 @@ define <4 x double> @var_shuffle_v4f64_v4f64_uxx0_i64(<4 x double> %x, i64 %i0, ; ALL-NEXT: andl $3, %edx ; ALL-NEXT: andl $3, %esi ; ALL-NEXT: vmovaps %ymm0, (%rsp) -; ALL-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; ALL-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] ; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; ALL-NEXT: movq %rbp, %rsp |

