| author | Sanjay Patel <spatel@rotateright.com> | 2016-12-15 18:43:46 +0000 |
|---|---|---|
| committer | Sanjay Patel <spatel@rotateright.com> | 2016-12-15 18:43:46 +0000 |
| commit | a97358bc8e8c9a98438d0bae0f863cf8b16c8e2b (patch) | |
| tree | 7084a68388911c2b0f252377dc622bdd587dba4d | |
| parent | 2c8de192a1eab0f60c3c3ccc3652e22cba275a82 (diff) | |
[x86] use a single shufps for 256-bit vectors when it can save instructions
This is the 256-bit counterpart to the 128-bit transform checked in here:
https://reviews.llvm.org/rL289837
This patch is based on the draft by @sroland (Roland Scheidegger) that is
attached to PR27885:
https://llvm.org/bugs/show_bug.cgi?id=27885
llvm-svn: 289846
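
For context, SHUFPS builds its two low result elements from the first source and its two high elements from the second source, and the 256-bit VSHUFPS form applies the same selection pattern in each 128-bit lane. The sketch below illustrates the kind of check `isSingleSHUFPSMask` performs on the per-lane repeated mask; it is a simplified standalone illustration under that assumption, not the LLVM source, and unlike LLVM's version it treats undef mask elements as non-matching. Indices 0-3 select from V1 and 4-7 from V2.

```cpp
#include <array>
#include <cstdio>

// Hypothetical illustration (not the LLVM source): a 4-element repeated
// mask can be matched by one SHUFPS when result elements 0 and 1 both
// select from one source and elements 2 and 3 both select from the other,
// because SHUFPS takes its low two dwords from the first operand and its
// high two dwords from the second. Indices 0-3 address V1, 4-7 address V2.
static bool canUseSingleShufps(const std::array<int, 4> &Mask) {
  auto fromV1 = [](int M) { return M >= 0 && M < 4; };
  auto fromV2 = [](int M) { return M >= 4 && M < 8; };
  // Low half from V1 and high half from V2, or the commuted form.
  return (fromV1(Mask[0]) && fromV1(Mask[1]) &&
          fromV2(Mask[2]) && fromV2(Mask[3])) ||
         (fromV2(Mask[0]) && fromV2(Mask[1]) &&
          fromV1(Mask[2]) && fromV1(Mask[3]));
}

int main() {
  // Per-lane repeated mask for shuffle_v8i32_8823cc67 in the tests below:
  // <8,8,2,3,12,12,6,7> repeats as <4,4,2,3> across both 128-bit lanes,
  // i.e. elements 0,1 from V2 and elements 2,3 from V1 -> one SHUFPS
  // with the sources commuted.
  std::printf("%d\n", canUseSingleShufps({4, 4, 2, 3})); // prints 1
  // An interleave mask alternates sources, so it needs more than one SHUFPS.
  std::printf("%d\n", canUseSingleShufps({0, 4, 1, 5})); // prints 0
  return 0;
}
```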
| -rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 14 |
|---|---|---|
| -rw-r--r-- | llvm/test/CodeGen/X86/pmul.ll | 16 |
| -rw-r--r-- | llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll | 43 |
3 files changed, 31 insertions, 42 deletions
```diff
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 9afc5902991..2b496b04f01 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -12232,7 +12232,9 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   // efficient instructions that mirror the shuffles across the two 128-bit
   // lanes.
   SmallVector<int, 4> RepeatedMask;
-  if (is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask)) {
+  bool Is128BitLaneRepeatedShuffle =
+      is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
+  if (Is128BitLaneRepeatedShuffle) {
     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
     if (V2.isUndef())
       return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
@@ -12273,6 +12275,16 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
     return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
   }
 
+  // Assume that a single SHUFPS is faster than an alternative sequence of
+  // multiple instructions (even if the CPU has a domain penalty).
+  // If some CPU is harmed by the domain switch, we can fix it in a later pass.
+  if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
+    SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
+    SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
+    SDValue ShufPS = DAG.getVectorShuffle(MVT::v8f32, DL, CastV1, CastV2, Mask);
+    return DAG.getBitcast(MVT::v8i32, ShufPS);
+  }
+
   // Try to simplify this by merging 128-bit lanes to enable a lane-based
   // shuffle.
   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
diff --git a/llvm/test/CodeGen/X86/pmul.ll b/llvm/test/CodeGen/X86/pmul.ll
index b611b8cc5fe..32a7c376170 100644
--- a/llvm/test/CodeGen/X86/pmul.ll
+++ b/llvm/test/CodeGen/X86/pmul.ll
@@ -1390,17 +1390,15 @@ define <8 x i32> @mul_v8i64_zero_upper(<8 x i32> %val1, <8 x i32> %val2) {
 ;
 ; AVX2-LABEL: mul_v8i64_zero_upper:
 ; AVX2:       # BB#0: # %entry
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
 ; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpmuludq %ymm3, %ymm2, %ymm1
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,5,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[1,3],ymm0[1,3],ymm1[5,7],ymm0[5,7]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT:    retq
 ;
@@ -1410,9 +1408,7 @@ define <8 x i32> @mul_v8i64_zero_upper(<8 x i32> %val1, <8 x i32> %val2) {
 ; AVX512-NEXT:    vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
 ; AVX512-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,5,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX512-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX512-NEXT:    retq
 entry:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
index e9c4e371bb6..8b28a336eb7 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -1223,47 +1223,28 @@ define <8 x i32> @shuffle_v8i32_08084c4c(<8 x i32> %a, <8 x i32> %b) {
 }
 
 define <8 x i32> @shuffle_v8i32_8823cc67(<8 x i32> %a, <8 x i32> %b) {
-; AVX1-LABEL: shuffle_v8i32_8823cc67:
-; AVX1:       # BB#0:
-; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[0,0],ymm0[2,3],ymm1[4,4],ymm0[6,7]
-; AVX1-NEXT:    retq
-;
-; AVX2OR512VL-LABEL: shuffle_v8i32_8823cc67:
-; AVX2OR512VL:       # BB#0:
-; AVX2OR512VL-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,3,4,4,6,7]
-; AVX2OR512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
-; AVX2OR512VL-NEXT:    retq
+; ALL-LABEL: shuffle_v8i32_8823cc67:
+; ALL:       # BB#0:
+; ALL-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[0,0],ymm0[2,3],ymm1[4,4],ymm0[6,7]
+; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 8, i32 8, i32 2, i32 3, i32 12, i32 12, i32 6, i32 7>
   ret <8 x i32> %shuffle
 }
 
 define <8 x i32> @shuffle_v8i32_9832dc76(<8 x i32> %a, <8 x i32> %b) {
-; AVX1-LABEL: shuffle_v8i32_9832dc76:
-; AVX1:       # BB#0:
-; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[1,0],ymm0[3,2],ymm1[5,4],ymm0[7,6]
-; AVX1-NEXT:    retq
-;
-; AVX2OR512VL-LABEL: shuffle_v8i32_9832dc76:
-; AVX2OR512VL:       # BB#0:
-; AVX2OR512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
-; AVX2OR512VL-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
-; AVX2OR512VL-NEXT:    retq
+; ALL-LABEL: shuffle_v8i32_9832dc76:
+; ALL:       # BB#0:
+; ALL-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[1,0],ymm0[3,2],ymm1[5,4],ymm0[7,6]
+; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 9, i32 8, i32 3, i32 2, i32 13, i32 12, i32 7, i32 6>
   ret <8 x i32> %shuffle
 }
 
 define <8 x i32> @shuffle_v8i32_9810dc54(<8 x i32> %a, <8 x i32> %b) {
-; AVX1-LABEL: shuffle_v8i32_9810dc54:
-; AVX1:       # BB#0:
-; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4]
-; AVX1-NEXT:    retq
-;
-; AVX2OR512VL-LABEL: shuffle_v8i32_9810dc54:
-; AVX2OR512VL:       # BB#0:
-; AVX2OR512VL-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,1,1,0,4,5,5,4]
-; AVX2OR512VL-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,0,2,3,5,4,6,7]
-; AVX2OR512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
-; AVX2OR512VL-NEXT:    retq
+; ALL-LABEL: shuffle_v8i32_9810dc54:
+; ALL:       # BB#0:
+; ALL-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4]
+; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 9, i32 8, i32 1, i32 0, i32 13, i32 12, i32 5, i32 4>
   ret <8 x i32> %shuffle
 }
```
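For reference, here is a small scalar emulation of 256-bit VSHUFPS as used by the new codegen, assuming its documented lane-repeated semantics: within each 128-bit lane the two low result dwords come from the first source and the two high dwords from the second, all selected by one shared 8-bit immediate. The function name and the worked immediate value are illustrative only, not taken from the patch.

```cpp
#include <array>
#include <cstdint>
#include <cstdio>

// Scalar emulation of 256-bit VSHUFPS on <8 x i32>, assuming the documented
// semantics: within each 128-bit lane, the two low result dwords are picked
// from Src1 and the two high result dwords from Src2, with all four 2-bit
// selectors coming from one 8-bit immediate (so both lanes use one pattern).
static std::array<uint32_t, 8> vshufps(const std::array<uint32_t, 8> &Src1,
                                       const std::array<uint32_t, 8> &Src2,
                                       uint8_t Imm) {
  std::array<uint32_t, 8> R{};
  for (int Lane = 0; Lane < 8; Lane += 4) {
    R[Lane + 0] = Src1[Lane + ((Imm >> 0) & 3)];
    R[Lane + 1] = Src1[Lane + ((Imm >> 2) & 3)];
    R[Lane + 2] = Src2[Lane + ((Imm >> 4) & 3)];
    R[Lane + 3] = Src2[Lane + ((Imm >> 6) & 3)];
  }
  return R;
}

int main() {
  // shuffle_v8i32_8823cc67: ymm0 = ymm1[0,0],ymm0[2,3],ymm1[4,4],ymm0[6,7].
  // Selectors 0,0,2,3 encode as imm8 = 0 | (0<<2) | (2<<4) | (3<<6) = 0xE0,
  // with ymm1 (%b) as the first source and ymm0 (%a) as the second.
  std::array<uint32_t, 8> A = {0, 1, 2, 3, 4, 5, 6, 7};       // %a (ymm0)
  std::array<uint32_t, 8> B = {8, 9, 10, 11, 12, 13, 14, 15}; // %b (ymm1)
  auto R = vshufps(B, A, 0xE0);
  for (uint32_t V : R)
    std::printf("%u ", V); // prints: 8 8 2 3 12 12 6 7
  std::printf("\n");
  return 0;
}
```

The output matches the shufflevector mask <8,8,2,3,12,12,6,7> in the test, which is why a single vshufps now replaces the vpshufd+vpblendd pair on AVX2 and AVX-512.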

