From 4a97df01c43170eed51d2cca0dc779d9d2ca6419 Mon Sep 17 00:00:00 2001
From: Michael Zuckerman
Date: Mon, 25 Sep 2017 14:50:38 +0000
Subject: [X86][LLVM] Expand support for lowerInterleavedStore() in
 X86InterleavedAccess (VF8 stride 4)

This patch expands the support of lowerInterleavedStore() to the 8 x i8,
stride-4 case. LLVM currently creates suboptimal shuffle code-gen for AVX2.
Overall, this patch is a specific fix for the (stride=4, VF=8) pattern; we
plan to include more patterns in the future.

The goal of the patch is to optimize the following sequence:
At the end of the computation, xmm2, xmm0, xmm12 and xmm3 each hold 8 chars:

c0, c1, ..., c7
m0, m1, ..., m7
y0, y1, ..., y7
k0, k1, ..., k7

These need to be transposed/interleaved and stored like so:

c0 m0 y0 k0 c1 m1 y1 k1 c2 m2 y2 k2 c3 m3 y3 k3 ....

Reviewers: DavidKreitzer, Farhana, zvi, igorb, guyblank, RKSimon, Ayal

Differential Revision: https://reviews.llvm.org/D36058

Change-Id: I3cc5c2ca5d6318901c192a4428493b99ef424c32
llvm-svn: 314109
---
 llvm/test/CodeGen/X86/x86-interleaved-access.ll    | 52 +++++++++-------------
 .../InterleavedAccess/X86/interleavedStore.ll      | 16 +++++--
 2 files changed, 33 insertions(+), 35 deletions(-)

(limited to 'llvm/test')

diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
index 3bdd2dee155..a65c42165a7 100644
--- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
@@ -936,47 +936,37 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(<128 x i8>* %ptr) {
 define void @interleaved_store_vf8_i8_stride4(<8 x i8> %x1, <8 x i8> %x2, <8 x i8> %x3, <8 x i8> %x4, <32 x i8>* %p) {
 ; AVX1-LABEL: interleaved_store_vf8_i8_stride4:
 ; AVX1: # BB#0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <4,12,5,13,6,14,7,15,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm3
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm2
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,1,9,2,10,3,11,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm1
+; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
 ; AVX1-NEXT: vmovaps %ymm0, (%rdi)
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
 ;
 ; AVX-LABEL: interleaved_store_vf8_i8_stride4:
 ; AVX: # BB#0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <4,12,5,13,6,14,7,15,u,u,u,u,u,u,u,u>
-; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm3
-; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm2
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,1,9,2,10,3,11,u,u,u,u,u,u,u,u>
-; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX-NEXT: vpshufb %xmm4, %xmm1, %xmm1
+; AVX-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm1
+; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
 ; AVX-NEXT: vmovdqa %ymm0, (%rdi)
 ; AVX-NEXT: vzeroupper
 ; AVX-NEXT: retq
 %v1 = shufflevector <8 x i8> %x1, <8 x i8> %x2, <16 x i32>
-%v2 = shufflevector <8 x i8> %x2, <8 x i8> %x3, <16 x i32>
+%v2 = shufflevector <8 x i8> %x3, <8 x i8> %x4, <16 x i32>
 %interleaved.vec = shufflevector <16 x i8> %v1, <16 x i8> %v2, <32 x i32>
 store <32 x i8> %interleaved.vec, <32 x i8>* %p
 ret void
diff --git a/llvm/test/Transforms/InterleavedAccess/X86/interleavedStore.ll b/llvm/test/Transforms/InterleavedAccess/X86/interleavedStore.ll
index 14695e733b1..5819b5cd40b 100644
--- a/llvm/test/Transforms/InterleavedAccess/X86/interleavedStore.ll
+++ b/llvm/test/Transforms/InterleavedAccess/X86/interleavedStore.ll
@@ -66,13 +66,21 @@ ret void
 define void @interleaved_store_vf8_i8_stride4(<8 x i8> %x1, <8 x i8> %x2, <8 x i8> %x3, <8 x i8> %x4, <32 x i8>* %p) {
 ; CHECK-LABEL: @interleaved_store_vf8_i8_stride4(
 ; CHECK-NEXT: [[V1:%.*]] = shufflevector <8 x i8> [[X1:%.*]], <8 x i8> [[X2:%.*]], <16 x i32>
-; CHECK-NEXT: [[V2:%.*]] = shufflevector <8 x i8> [[X2]], <8 x i8> [[X3:%.*]], <16 x i32>
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i8> [[V1]], <16 x i8> [[V2]], <32 x i32>
-; CHECK-NEXT: store <32 x i8> [[INTERLEAVED_VEC]], <32 x i8>* [[P:%.*]]
+; CHECK-NEXT: [[V2:%.*]] = shufflevector <8 x i8> [[X3:%.*]], <8 x i8> [[X4:%.*]], <16 x i32>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[V1]], <16 x i8> [[V2]], <8 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[V1]], <16 x i8> [[V2]], <8 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i8> [[V1]], <16 x i8> [[V2]], <8 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i8> [[V1]], <16 x i8> [[V2]], <8 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32>
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], <16 x i32>
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], <16 x i32>
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x i8> [[TMP7]], <16 x i8> [[TMP8]], <32 x i32>
+; CHECK-NEXT: store <32 x i8> [[TMP9]], <32 x i8>* [[P:%.*]]
 ; CHECK-NEXT: ret void
 ;
  %v1 = shufflevector <8 x i8> %x1, <8 x i8> %x2, <16 x i32>
- %v2 = shufflevector <8 x i8> %x2, <8 x i8> %x3, <16 x i32>
+ %v2 = shufflevector <8 x i8> %x3, <8 x i8> %x4, <16 x i32>
  %interleaved.vec = shufflevector <16 x i8> %v1, <16 x i8> %v2, <32 x i32>
  store <32 x i8> %interleaved.vec, <32 x i8>* %p
  ret void
--
cgit v1.2.3
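
As background, the source-level pattern that produces a stride-4, VF8 interleaved
store is an array-of-structs write with four byte-sized fields. The sketch below is
illustrative only and is not part of the patch; the struct and function names
(cmyk, pack_cmyk) are hypothetical. When the loop is vectorized at VF=8, the four
channel values become four <8 x i8> vectors, and the per-field stores become the
single <32 x i8> interleaved store that lowerInterleavedStore() now lowers to the
unpack-based transpose checked above.

/* Hypothetical C example (not taken from the patch): a stride-4 interleaved
 * store as it appears in user code. Each iteration writes c[i], m[i], y[i],
 * k[i] to four consecutive bytes, giving the c0 m0 y0 k0 c1 m1 y1 k1 ...
 * memory layout described in the commit message. */
struct cmyk {
  unsigned char c, m, y, k;
};

void pack_cmyk(struct cmyk *out, const unsigned char *c,
               const unsigned char *m, const unsigned char *y,
               const unsigned char *k, int n) {
  for (int i = 0; i < n; ++i) {
    out[i].c = c[i]; /* byte 4*i + 0 */
    out[i].m = m[i]; /* byte 4*i + 1 */
    out[i].y = y[i]; /* byte 4*i + 2 */
    out[i].k = k[i]; /* byte 4*i + 3 */
  }
}

In IR terms, the vectorizer concatenates the four <8 x i8> channel vectors and
emits one wide shufflevector plus a <32 x i8> store, as in the test functions
above; this patch teaches the X86 interleaved-access lowering to decompose that
wide shuffle into the vpshufb/vpunpck sequence instead of the previous
suboptimal code-gen.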