author    Simon Pilgrim <llvm-dev@redking.me.uk>    2018-01-27 19:48:13 +0000
committer Simon Pilgrim <llvm-dev@redking.me.uk>    2018-01-27 19:48:13 +0000
commit    fe3fac805accd43dbfbc5ee87a065cbfe0a1d80e
tree      6cc7ecf300604257259ea6a7c6f79ff7a010b3ff /llvm
parent    73e88d394b5e98c5dcc4127f52e0af5bbaa196fe
[X86][SSE] Simplify demanded elements from BROADCAST shuffle source.
If broadcasting from another shuffle, attempt to simplify it.
We can probably generalize this a lot further (e.g. by embedding it in combineX86ShufflesRecursively), but BROADCAST is one of the more troublesome cases since it accepts inputs of a different size than its result.
llvm-svn: 323602
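To illustrate the demanded-elements idea behind the patch, here is a minimal standalone sketch (plain C++, deliberately not the LLVM API; the function name and sentinel value are illustrative only). A broadcast only reads the source elements that feed its low lane, so every other element of the source shuffle can be marked undef before asking the shuffle combiner to simplify it:

#include <cstdio>
#include <vector>

// Stand-in for LLVM's SM_SentinelUndef shuffle-mask sentinel.
constexpr int kSentinelUndef = -1;

// Build the demanded-elements mask for a broadcast whose result scalar is
// DstScalarBits wide, reading a shuffle source of SrcNumElts elements of
// SrcScalarBits each. Only the low DstScalarBits/SrcScalarBits elements
// feed the broadcast lane; the rest are undef.
std::vector<int> broadcastDemandedMask(unsigned DstScalarBits,
                                       unsigned SrcScalarBits,
                                       unsigned SrcNumElts) {
  std::vector<int> Mask(SrcNumElts, kSentinelUndef);
  if (DstScalarBits % SrcScalarBits != 0)
    return Mask; // sizes don't divide evenly; demand nothing extra
  unsigned Scale = DstScalarBits / SrcScalarBits;
  for (unsigned i = 0; i != Scale; ++i)
    Mask[i] = static_cast<int>(i); // identity-demand the lane-0 feeders
  return Mask;
}

int main() {
  // A 64-bit broadcast lane over a v4i32 shuffle source demands
  // elements 0 and 1 only.
  for (int M : broadcastDemandedMask(64, 32, 4))
    std::printf("%d ", M); // prints: 0 1 -1 -1
  std::printf("\n");
}

For a 64-bit broadcast lane over a v4i32 shuffle source this yields {0, 1, -1, -1}, which mirrors the DemandedMask built in the combineTargetShuffle change below.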
Diffstat (limited to 'llvm')

 llvm/lib/Target/X86/X86ISelLowering.cpp                       | 30
 llvm/test/CodeGen/X86/avx512-shuffles/broadcast-vector-int.ll | 68
 llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll                |  2
 llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll         |  2
 llvm/test/CodeGen/X86/widened-broadcast.ll                    | 14
 5 files changed, 61 insertions(+), 55 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 58246c976ce..ad06e996f36 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -28242,6 +28242,14 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
   unsigned NumMaskElts = Mask.size();
   unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
 
+  // Match against a VZEXT_MOVL vXi32 zero-extending instruction.
+  if (MaskEltSize == 32 && isUndefOrEqual(Mask[0], 0) &&
+      isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) {
+    Shuffle = X86ISD::VZEXT_MOVL;
+    SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
+    return true;
+  }
+
   // Match against a ZERO_EXTEND_VECTOR_INREG/VZEXT instruction.
   // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
   if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
@@ -29790,6 +29798,28 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
   }
 
   switch (Opcode) {
+  case X86ISD::VBROADCAST: {
+    // If broadcasting from another shuffle, attempt to simplify it.
+    // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
+    SDValue Src = N.getOperand(0);
+    SDValue BC = peekThroughBitcasts(Src);
+    EVT SrcVT = Src.getValueType();
+    EVT BCVT = BC.getValueType();
+    if (isTargetShuffle(BC.getOpcode()) &&
+        VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
+      unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
+      SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
+                                        SM_SentinelUndef);
+      for (unsigned i = 0; i != Scale; ++i)
+        DemandedMask[i] = i;
+      if (SDValue Res = combineX86ShufflesRecursively(
+              {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 1,
+              /*HasVarMask*/ false, DAG, DCI, Subtarget))
+        return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
+                           DAG.getBitcast(SrcVT, Res));
+    }
+    return SDValue();
+  }
   case X86ISD::PSHUFD:
   case X86ISD::PSHUFLW:
   case X86ISD::PSHUFHW:
diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/broadcast-vector-int.ll b/llvm/test/CodeGen/X86/avx512-shuffles/broadcast-vector-int.ll
index 905ced47ae8..b6b5a6bcdca 100644
--- a/llvm/test/CodeGen/X86/avx512-shuffles/broadcast-vector-int.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffles/broadcast-vector-int.ll
@@ -459,9 +459,7 @@ define <4 x i32> @test_masked_z_2xi32_to_4xi32_mem_mask3(<2 x i32>* %vp, <4 x i3
 define <8 x i32> @test_2xi32_to_8xi32_mem(<2 x i32>* %vp) {
 ; CHECK-LABEL: test_2xi32_to_8xi32_mem:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,4,5,6,7],zero,zero,zero,zero
-; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0
+; CHECK-NEXT: vbroadcastsd (%rdi), %ymm0
 ; CHECK-NEXT: retq
   %vec = load <2 x i32>, <2 x i32>* %vp
   %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -470,11 +468,9 @@ define <8 x i32> @test_2xi32_to_8xi32_mem(<2 x i32>* %vp) {
 define <8 x i32> @test_masked_2xi32_to_8xi32_mem_mask0(<2 x i32>* %vp, <8 x i32> %default, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_2xi32_to_8xi32_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
-; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,7,4,5,6,7],zero,zero,zero,zero
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1
-; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} = xmm2[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} = mem[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT: retq
   %vec = load <2 x i32>, <2 x i32>* %vp
   %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -486,11 +482,9 @@ define <8 x i32> @test_masked_2xi32_to_8xi32_mem_mask0(<2 x i32>* %vp, <8 x i32>
 define <8 x i32> @test_masked_z_2xi32_to_8xi32_mem_mask0(<2 x i32>* %vp, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,4,5,6,7],zero,zero,zero,zero
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %ymm2, %ymm0, %k1
-; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = xmm1[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT: retq
   %vec = load <2 x i32>, <2 x i32>* %vp
   %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -501,11 +495,9 @@ define <8 x i32> @test_masked_z_2xi32_to_8xi32_mem_mask0(<2 x i32>* %vp, <8 x i3
 define <8 x i32> @test_masked_2xi32_to_8xi32_mem_mask1(<2 x i32>* %vp, <8 x i32> %default, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_2xi32_to_8xi32_mem_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
-; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,7,4,5,6,7],zero,zero,zero,zero
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1
-; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} = xmm2[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} = mem[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT: retq
   %vec = load <2 x i32>, <2 x i32>* %vp
   %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -517,11 +509,9 @@ define <8 x i32> @test_masked_2xi32_to_8xi32_mem_mask1(<2 x i32>* %vp, <8 x i32>
 define <8 x i32> @test_masked_z_2xi32_to_8xi32_mem_mask1(<2 x i32>* %vp, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mem_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,4,5,6,7],zero,zero,zero,zero
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %ymm2, %ymm0, %k1
-; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = xmm1[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT: retq
   %vec = load <2 x i32>, <2 x i32>* %vp
   %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -532,11 +522,9 @@ define <8 x i32> @test_masked_z_2xi32_to_8xi32_mem_mask1(<2 x i32>* %vp, <8 x i3
 define <8 x i32> @test_masked_2xi32_to_8xi32_mem_mask2(<2 x i32>* %vp, <8 x i32> %default, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_2xi32_to_8xi32_mem_mask2:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
-; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,7,4,5,6,7],zero,zero,zero,zero
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1
-; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} = xmm2[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} = mem[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT: retq
   %vec = load <2 x i32>, <2 x i32>* %vp
   %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -548,11 +536,9 @@ define <8 x i32> @test_masked_2xi32_to_8xi32_mem_mask2(<2 x i32>* %vp, <8 x i32>
 define <8 x i32> @test_masked_z_2xi32_to_8xi32_mem_mask2(<2 x i32>* %vp, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mem_mask2:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,4,5,6,7],zero,zero,zero,zero
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %ymm2, %ymm0, %k1
-; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = xmm1[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT: retq
   %vec = load <2 x i32>, <2 x i32>* %vp
   %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -563,11 +549,9 @@ define <8 x i32> @test_masked_z_2xi32_to_8xi32_mem_mask2(<2 x i32>* %vp, <8 x i3
 define <8 x i32> @test_masked_2xi32_to_8xi32_mem_mask3(<2 x i32>* %vp, <8 x i32> %default, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_2xi32_to_8xi32_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
-; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,7,4,5,6,7],zero,zero,zero,zero
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1
-; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} = xmm2[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} = mem[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT: retq
   %vec = load <2 x i32>, <2 x i32>* %vp
   %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -579,11 +563,9 @@ define <8 x i32> @test_masked_2xi32_to_8xi32_mem_mask3(<2 x i32>* %vp, <8 x i32>
 define <8 x i32> @test_masked_z_2xi32_to_8xi32_mem_mask3(<2 x i32>* %vp, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,4,5,6,7],zero,zero,zero,zero
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqd %ymm2, %ymm0, %k1
-; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = xmm1[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT: retq
   %vec = load <2 x i32>, <2 x i32>* %vp
   %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
index b0ce6b46e29..921be7c9152 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -203,7 +203,7 @@ define <8 x float> @shuffle_v8f32_08080808(<8 x float> %a, <8 x float> %b) {
 ;
 ; AVX2OR512VL-LABEL: shuffle_v8f32_08080808:
 ; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
+; AVX2OR512VL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; AVX2OR512VL-NEXT: vbroadcastsd %xmm0, %ymm0
 ; AVX2OR512VL-NEXT: retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8>
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll
index dc08ad8a3de..7207d8ecd59 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll
@@ -345,7 +345,7 @@ define void @buildvector_v4f32_0404(float %a, float %b, <4 x float>* %ptr) {
 ;
 ; X64AVX2-LABEL: buildvector_v4f32_0404:
 ; X64AVX2: # %bb.0:
-; X64AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; X64AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; X64AVX2-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 ; X64AVX2-NEXT: vmovapd %xmm0, (%rdi)
 ; X64AVX2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/widened-broadcast.ll b/llvm/test/CodeGen/X86/widened-broadcast.ll
index 1588078a390..ce99d22dbbd 100644
--- a/llvm/test/CodeGen/X86/widened-broadcast.ll
+++ b/llvm/test/CodeGen/X86/widened-broadcast.ll
@@ -606,16 +606,12 @@ define <8 x i32> @load_splat_8i32_2i32_0101(<2 x i32>* %vp) {
 ;
 ; AVX2-LABEL: load_splat_8i32_2i32_0101:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0
+; AVX2-NEXT: vbroadcastsd (%rdi), %ymm0
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: load_splat_8i32_2i32_0101:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
-; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0
+; AVX512-NEXT: vbroadcastsd (%rdi), %ymm0
 ; AVX512-NEXT: retq
   %vec = load <2 x i32>, <2 x i32>* %vp
   %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -642,10 +638,8 @@ define <16 x i32> @load_splat_16i32_2i32_0101(<2 x i32>* %vp) {
 ;
 ; AVX2-LABEL: load_splat_16i32_2i32_0101:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0
-; AVX2-NEXT: vmovdqa %ymm0, %ymm1
+; AVX2-NEXT: vbroadcastsd (%rdi), %ymm0
+; AVX2-NEXT: vmovaps %ymm0, %ymm1
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: load_splat_16i32_2i32_0101: