From 4a97df01c43170eed51d2cca0dc779d9d2ca6419 Mon Sep 17 00:00:00 2001 From: Michael Zuckerman Date: Mon, 25 Sep 2017 14:50:38 +0000 Subject: [X86][LLVM]Expanding Supports lowerInterleavedStore() in X86InterleavedAccess (VF8 stride 4): This patch expands the support of lowerInterleavedStore to 8x8i stride 4. LLVM creates suboptimal shuffle code-gen for AVX2. In overall, this patch is a specific fix for the pattern (Strid=4 VF=8) and we plan to include more patterns in the future. The patch goal is to optimize the following sequence: At the end of the computation, we have xmm2, xmm0, xmm12 and xmm3 holding each 8 chars: c0, c1, , c7 m0, m1, , m7 y0, y1, , y7 k0, k1, ., k7 And these need to be transposed/interleaved and stored like so: c0 m0 y0 k0 c1 m1 y1 k1 c2 m2 y2 k2 c3 m3 y3 k3 .... Reviewers DavidKreitzer Farhana zvi igorb guyblank RKSimon Ayal Differential Revision: https://reviews.llvm.org/D36058 Change-Id: I3cc5c2ca5d6318901c192a4428493b99ef424c32 llvm-svn: 314109 --- .../Transforms/InterleavedAccess/X86/interleavedStore.ll | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'llvm/test/Transforms/InterleavedAccess') diff --git a/llvm/test/Transforms/InterleavedAccess/X86/interleavedStore.ll b/llvm/test/Transforms/InterleavedAccess/X86/interleavedStore.ll index 14695e733b1..5819b5cd40b 100644 --- a/llvm/test/Transforms/InterleavedAccess/X86/interleavedStore.ll +++ b/llvm/test/Transforms/InterleavedAccess/X86/interleavedStore.ll @@ -66,13 +66,21 @@ ret void define void @interleaved_store_vf8_i8_stride4(<8 x i8> %x1, <8 x i8> %x2, <8 x i8> %x3, <8 x i8> %x4, <32 x i8>* %p) { ; CHECK-LABEL: @interleaved_store_vf8_i8_stride4( ; CHECK-NEXT: [[V1:%.*]] = shufflevector <8 x i8> [[X1:%.*]], <8 x i8> [[X2:%.*]], <16 x i32> -; CHECK-NEXT: [[V2:%.*]] = shufflevector <8 x i8> [[X2]], <8 x i8> [[X3:%.*]], <16 x i32> -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i8> [[V1]], <16 x i8> [[V2]], <32 x i32> -; CHECK-NEXT: store <32 x i8> [[INTERLEAVED_VEC]], <32 x i8>* [[P:%.*]] +; CHECK-NEXT: [[V2:%.*]] = shufflevector <8 x i8> [[X3:%.*]], <8 x i8> [[X4:%.*]], <16 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[V1]], <16 x i8> [[V2]], <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[V1]], <16 x i8> [[V2]], <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i8> [[V1]], <16 x i8> [[V2]], <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i8> [[V1]], <16 x i8> [[V2]], <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x i8> [[TMP7]], <16 x i8> [[TMP8]], <32 x i32> +; CHECK-NEXT: store <32 x i8> [[TMP9]], <32 x i8>* [[P:%.*]] ; CHECK-NEXT: ret void ; %v1 = shufflevector <8 x i8> %x1, <8 x i8> %x2, <16 x i32> - %v2 = shufflevector <8 x i8> %x2, <8 x i8> %x3, <16 x i32> + %v2 = shufflevector <8 x i8> %x3, <8 x i8> %x4, <16 x i32> %interleaved.vec = shufflevector <16 x i8> %v1, <16 x i8> %v2, <32 x i32> store <32 x i8> %interleaved.vec, <32 x i8>* %p ret void -- cgit v1.2.3