| author | Sanjay Patel <spatel@rotateright.com> | 2019-05-28 13:54:17 +0000 |
|---|---|---|
| committer | Sanjay Patel <spatel@rotateright.com> | 2019-05-28 13:54:17 +0000 |
| commit | d5a8637072f4c556b88156bd2f6237a2ead47d31 (patch) | |
| tree | d311a078955e4ace833057013b1fbb8bc95d4c3a /llvm/test/CodeGen/X86/x86-interleaved-access.ll | |
| parent | 9cd9624fb68db36835229ba5b08f9797b2f9d16b (diff) | |
[x86] split 256-bit store of concatenated vectors
This shows up as a side issue to the main problem for the AVX target example from PR37428:
https://bugs.llvm.org/show_bug.cgi?id=37428 - https://godbolt.org/z/7tpRa3
But as we can see in the pile of existing test diffs, it's actually a widespread problem
that affects any AVX or later target. Apart from a couple of oddballs, I think these are
all improvements for the reasons stated in the code comment: we do not want to enable YMM
unnecessarily (avoid vzeroupper and frequency throttling) and some cores split 256-bit
stores anyway.
We could say that MergeConsecutiveStores() is going overboard on some of these examples,
but reining that in would not solve the problem completely. That interaction is also why
I'm proposing this as a lowering rather than a combine: if we tried this any earlier, we
would infinite-loop fighting the merge code.
Differential Revision: https://reviews.llvm.org/D62498
llvm-svn: 361822
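
To make the affected pattern concrete, here is a minimal, hypothetical IR reduction (the function name and the exact vector types are made up for illustration; this is not part of the patch or of the test file below): a 256-bit store whose value is nothing but the concatenation of two 128-bit halves. On an AVX1 target, such a store would be expected to lower to vinsertf128 feeding a single 32-byte ymm store plus vzeroupper; with this change the halves can instead be stored directly as two 16-byte xmm stores, as the test diffs below show for the analogous interleaved-store cases.

```llvm
; Hypothetical reduced example (not taken from the patch or the test file):
; a 256-bit store of a value that is only the concatenation of two 128-bit inputs.
define void @store_concat_v32i8(<16 x i8> %lo, <16 x i8> %hi, <32 x i8>* %p) {
  ; Concatenate the two 128-bit halves into one 256-bit vector.
  %cat = shufflevector <16 x i8> %lo, <16 x i8> %hi, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ; A single 32-byte store of the concatenation; the split lowering can turn this
  ; into two 16-byte stores at (%p) and 16 bytes past it, so no ymm register is needed.
  store <32 x i8> %cat, <32 x i8>* %p, align 32
  ret void
}
```

The interleaved_store_vf8_i8_stride4 test in the diff below ends up in essentially this shape once its interleaving shuffles have been lowered to xmm operations, which is why its single 32-byte store is now split.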
Diffstat (limited to 'llvm/test/CodeGen/X86/x86-interleaved-access.ll')
| -rw-r--r-- | llvm/test/CodeGen/X86/x86-interleaved-access.ll | 73 |
1 file changed, 26 insertions, 47 deletions
diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
index bff39467c1e..8cd01b631d6 100644
--- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
@@ -341,11 +341,10 @@ define void @interleaved_store_vf16_i8_stride4(<16 x i8> %x1, <16 x i8> %x2, <16
 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0
-; AVX1-NEXT: vmovaps %ymm0, 32(%rdi)
-; AVX1-NEXT: vmovaps %ymm1, (%rdi)
-; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: vmovdqa %xmm0, 48(%rdi)
+; AVX1-NEXT: vmovdqa %xmm4, 32(%rdi)
+; AVX1-NEXT: vmovdqa %xmm1, 16(%rdi)
+; AVX1-NEXT: vmovdqa %xmm3, (%rdi)
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: interleaved_store_vf16_i8_stride4:
@@ -358,11 +357,10 @@ define void @interleaved_store_vf16_i8_stride4(<16 x i8> %x1, <16 x i8> %x2, <16
 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0
-; AVX2-NEXT: vmovdqa %ymm0, 32(%rdi)
-; AVX2-NEXT: vmovdqa %ymm1, (%rdi)
-; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: vmovdqa %xmm0, 48(%rdi)
+; AVX2-NEXT: vmovdqa %xmm4, 32(%rdi)
+; AVX2-NEXT: vmovdqa %xmm1, 16(%rdi)
+; AVX2-NEXT: vmovdqa %xmm3, (%rdi)
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: interleaved_store_vf16_i8_stride4:
@@ -888,37 +886,20 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(<128 x i8>* %ptr) {
 }
 
 define void @interleaved_store_vf8_i8_stride4(<8 x i8> %x1, <8 x i8> %x2, <8 x i8> %x3, <8 x i8> %x4, <32 x i8>* %p) {
-; AVX1-LABEL: interleaved_store_vf8_i8_stride4:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm1
-; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX1-NEXT: vmovaps %ymm0, (%rdi)
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2OR512-LABEL: interleaved_store_vf8_i8_stride4:
-; AVX2OR512: # %bb.0:
-; AVX2OR512-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2OR512-NEXT: vpshufb %xmm4, %xmm1, %xmm1
-; AVX2OR512-NEXT: vpshufb %xmm4, %xmm0, %xmm0
-; AVX2OR512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX2OR512-NEXT: vpshufb %xmm4, %xmm3, %xmm1
-; AVX2OR512-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX2OR512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX2OR512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2OR512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX2OR512-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
-; AVX2OR512-NEXT: vmovdqa %ymm0, (%rdi)
-; AVX2OR512-NEXT: vzeroupper
-; AVX2OR512-NEXT: retq
+; AVX-LABEL: interleaved_store_vf8_i8_stride4:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX-NEXT: vpshufb %xmm4, %xmm1, %xmm1
+; AVX-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm1
+; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX-NEXT: vmovdqa %xmm0, 16(%rdi)
+; AVX-NEXT: vmovdqa %xmm2, (%rdi)
+; AVX-NEXT: retq
 %v1 = shufflevector <8 x i8> %x1, <8 x i8> %x2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 %v2 = shufflevector <8 x i8> %x3, <8 x i8> %x4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 %interleaved.vec = shufflevector <16 x i8> %v1, <16 x i8> %v2, <32 x i32> <i32 0,i32 8,i32 16,i32 24,i32 1,i32 9,i32 17,i32 25,i32 2,i32 10,i32 18,i32 26,i32 3,i32 11,i32 19,i32 27,i32 4,i32 12,i32 20,i32 28,i32 5,i32 13,i32 21,i32 29,i32 6,i32 14,i32 22,i32 30,i32 7,i32 15,i32 23,i32 31>
@@ -1096,10 +1077,9 @@ define void @interleaved_store_vf16_i8_stride3(<16 x i8> %a, <16 x i8> %b, <16 x
 ; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
 ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vmovdqu %xmm0, 16(%rdi)
+; AVX1-NEXT: vmovdqu %xmm1, (%rdi)
 ; AVX1-NEXT: vmovdqu %xmm2, 32(%rdi)
-; AVX1-NEXT: vmovups %ymm0, (%rdi)
-; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: interleaved_store_vf16_i8_stride3:
@@ -1116,10 +1096,9 @@ define void @interleaved_store_vf16_i8_stride3(<16 x i8> %a, <16 x i8> %b, <16 x
 ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
 ; AVX2-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
 ; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vmovdqu %xmm0, 16(%rdi)
+; AVX2-NEXT: vmovdqu %xmm1, (%rdi)
 ; AVX2-NEXT: vmovdqu %xmm2, 32(%rdi)
-; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
-; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: interleaved_store_vf16_i8_stride3:

