diff options
| author | Simon Pilgrim <llvm-dev@redking.me.uk> | 2016-02-17 10:37:49 +0000 |
|---|---|---|
| committer | Simon Pilgrim <llvm-dev@redking.me.uk> | 2016-02-17 10:37:49 +0000 |
| commit | a50e8d3627eb7f56ebdf0351e18759bb50322c37 (patch) | |
| tree | 9a969e91e7a84d0a70a6aed1a8608c0e8fc9498c | |
| parent | 11481f502c40f86037dc1479c4147f543f60a41b (diff) | |
| download | bcm5719-llvm-a50e8d3627eb7f56ebdf0351e18759bb50322c37.tar.gz bcm5719-llvm-a50e8d3627eb7f56ebdf0351e18759bb50322c37.zip | |
[X86][AVX] Support bit-mask integer shuffles for 256-bit integer vectors
AVX1 doesn't support the shuffling of 256-bit integer vectors. For 32/64-bit elements we get around this by shuffling as float/double but for 8/16-bit elements (assuming they can't widen) we currently just split, shuffle as 128-bit vectors and concatenate the results back.
This patch adds the ability to lower using the bit-mask patterns before defaulting to the splitting behaviour. In some cases this ends up matching what AVX2 would do anyhow or what AVX1 does on the split vectors.
Part 1 of 2
Differential Revision: http://reviews.llvm.org/D17292
llvm-svn: 261081
| -rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 8 | ||||
| -rw-r--r-- | llvm/test/CodeGen/X86/vec_int_to_fp.ll | 4 | ||||
| -rw-r--r-- | llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll | 17 | ||||
| -rw-r--r-- | llvm/test/CodeGen/X86/vector-zext.ll | 16 |
4 files changed, 16 insertions, 29 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 37f9df27248..5d6bdea8073 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -11480,9 +11480,13 @@ static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, // immediately cast everything to a float and operate entirely in that domain. if (VT.isInteger() && !Subtarget.hasAVX2()) { int ElementBits = VT.getScalarSizeInBits(); - if (ElementBits < 32) - // No floating point type available, decompose into 128-bit vectors. + if (ElementBits < 32) { + // No floating point type available, if we can't use the bit operations + // for masking then decompose into 128-bit vectors. + if (SDValue V = lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, DAG)) + return V; return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); + } MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits), VT.getVectorNumElements()); diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll index 87ade38e426..42924069d9e 100644 --- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll +++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll @@ -1754,11 +1754,9 @@ define <8 x float> @uitofp_8i8_to_8f32(<16 x i8> %a) { ; AVX1: # BB#0: ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 ; AVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll index aa122d14f13..5828d59b76a 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -949,19 +949,10 @@ define <32 x i8> @shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_ } define <32 x i8> @shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31(<32 x i8> %a) { -; AVX1-LABEL: shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31: -; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255] -; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31: -; AVX2: # BB#0: -; AVX2-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: retq +; ALL-LABEL: shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31: +; ALL: # BB#0: +; ALL-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; ALL-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 32, i32 1, i32 34, i32 3, i32 36, i32 5, i32 38, i32 7, i32 40, i32 9, i32 42, i32 11, i32 44, i32 13, i32 46, i32 15, i32 48, i32 17, i32 50, i32 19, i32 52, i32 21, i32 54, i32 23, i32 56, i32 25, i32 58, i32 27, i32 60, i32 29, i32 62, i32 31> ret <32 x i8> %shuffle } diff --git a/llvm/test/CodeGen/X86/vector-zext.ll b/llvm/test/CodeGen/X86/vector-zext.ll index b41bb8f7b10..d07f1ea62df 100644 --- a/llvm/test/CodeGen/X86/vector-zext.ll +++ b/llvm/test/CodeGen/X86/vector-zext.ll @@ -145,11 +145,9 @@ define <8 x i32> @zext_16i8_to_8i32(<16 x i8> %A) nounwind uwtable readnone ssp ; AVX1: # BB#0: # %entry ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: zext_16i8_to_8i32: @@ -228,12 +226,10 @@ define <4 x i64> @zext_16i8_to_4i64(<16 x i8> %A) nounwind uwtable readnone ssp ; AVX1-LABEL: zext_16i8_to_4i64: ; AVX1: # BB#0: # %entry ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: zext_16i8_to_4i64: @@ -390,12 +386,10 @@ define <4 x i64> @zext_8i16_to_4i64(<8 x i16> %A) nounwind uwtable readnone ssp ; AVX1-LABEL: zext_8i16_to_4i64: ; AVX1: # BB#0: # %entry ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] ; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: zext_8i16_to_4i64: |

