author     Simon Pilgrim <llvm-dev@redking.me.uk>    2016-08-24 18:07:53 +0000
committer  Simon Pilgrim <llvm-dev@redking.me.uk>    2016-08-24 18:07:53 +0000
commit     941bd6bbae6ba1fbf3d46cfb365d1015703ef448
tree       8eddc2a47b1117c80aff16e57288bb09d505d194
parent     26d9c41ff6b863253e1e9b17bd5acffc30ead692
[X86][SSE] Add support for combining VZEXT_MOVL target shuffles
This includes adding more general support for folding the pattern: VZEXT_MOVL(VZEXT_LOAD(ptr)) -> VZEXT_LOAD(ptr)
This has unearthed a couple of latent poor codegen issues (MINSS/MAXSS scalar load folding and MOVDDUP/BROADCAST load folding patterns), which will be fixed shortly.
It's also reduced a couple of tests so that they no longer reach the instruction threshold necessary to be combined to PSHUFB (see PR26183).
llvm-svn: 279646
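
To illustrate why the fold above is sound, here is a minimal standalone C++ sketch (not LLVM's DAG nodes; the helper names are made up for illustration) that models VZEXT_LOAD as a load that zeroes every lane above the loaded scalar, and VZEXT_MOVL as "keep lane 0, zero the rest". When the element sizes line up, applying the move on top of the load cannot change the value, so the move can be dropped.

#include <array>
#include <cassert>

using V4F32 = std::array<float, 4>;

// Stand-in for a VZEXT_LOAD of one float into a v4f32:
// lane 0 is loaded from memory, lanes 1-3 are zero.
static V4F32 vzextLoad(const float *Ptr) { return {Ptr[0], 0.0f, 0.0f, 0.0f}; }

// Stand-in for VZEXT_MOVL: keep the lowest lane, zero the upper lanes.
static V4F32 vzextMovl(const V4F32 &V) { return {V[0], 0.0f, 0.0f, 0.0f}; }

int main() {
  float Mem = 42.0f;
  // VZEXT_MOVL(VZEXT_LOAD(ptr)) produces exactly VZEXT_LOAD(ptr).
  assert(vzextMovl(vzextLoad(&Mem)) == vzextLoad(&Mem));
  return 0;
}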
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp               43
-rw-r--r--  llvm/test/CodeGen/X86/insertps-combine.ll               6
-rw-r--r--  llvm/test/CodeGen/X86/vec_ss_load_fold.ll               6
-rw-r--r--  llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll         54
-rw-r--r--  llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll         4
-rw-r--r--  llvm/test/CodeGen/X86/vector-shuffle-combining.ll      29
-rw-r--r--  llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll    3
7 files changed, 63 insertions, 82 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 293d5a4ac14..a32c1526b68 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -4236,6 +4236,21 @@ static bool isUndefOrInRange(ArrayRef<int> Mask,
   return true;
 }
 
+/// Return true if Val is undef, zero or if its value falls within the
+/// specified range (L, H].
+static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
+  return isUndefOrZero(Val) || (Val >= Low && Val < Hi);
+}
+
+/// Return true if every element in Mask is undef, zero or if its value
+/// falls within the specified range (L, H].
+static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
+  for (int M : Mask)
+    if (!isUndefOrZeroOrInRange(M, Low, Hi))
+      return false;
+  return true;
+}
+
 /// Return true if every element in Mask, beginning
 /// from position Pos and ending in Pos+Size, falls within the specified
 /// sequential range (Low, Low+Size]. or is undef.
@@ -25347,6 +25362,21 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
 
   unsigned Shuffle, PermuteImm;
   if (UnaryShuffle) {
+    // If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load
+    // directly if we don't shuffle the lower element and we shuffle the upper
+    // (zero) elements within themselves.
+    if (V1.getOpcode() == X86ISD::VZEXT_LOAD &&
+        (V1.getScalarValueSizeInBits() % MaskEltSizeInBits) == 0) {
+      unsigned Scale = V1.getScalarValueSizeInBits() / MaskEltSizeInBits;
+      ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale);
+      if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) &&
+          isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) {
+        DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, V1),
+                      /*AddTo*/ true);
+        return true;
+      }
+    }
+
     if (matchUnaryVectorShuffle(MaskVT, Mask, Subtarget, Shuffle, ShuffleVT)) {
       if (Depth == 1 && Root.getOpcode() == Shuffle)
         return false; // Nothing to do!
@@ -30502,17 +30532,6 @@ static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
-static SDValue combineVZextMovl(SDNode *N, SelectionDAG &DAG) {
-  SDValue Op = peekThroughBitcasts(N->getOperand(0));
-  EVT VT = N->getValueType(0), OpVT = Op.getValueType();
-  if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
-      VT.getVectorElementType().getSizeInBits() ==
-      OpVT.getVectorElementType().getSizeInBits()) {
-    return DAG.getBitcast(VT, Op);
-  }
-  return SDValue();
-}
-
 static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
                                       const X86Subtarget &Subtarget) {
   EVT VT = N->getValueType(0);
@@ -31498,7 +31517,6 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case X86ISD::FAND:        return combineFAnd(N, DAG, Subtarget);
   case X86ISD::FANDN:       return combineFAndn(N, DAG, Subtarget);
   case X86ISD::BT:          return combineBT(N, DAG, DCI);
-  case X86ISD::VZEXT_MOVL:  return combineVZextMovl(N, DAG);
   case ISD::ANY_EXTEND:
   case ISD::ZERO_EXTEND:    return combineZext(N, DAG, DCI, Subtarget);
   case ISD::SIGN_EXTEND:    return combineSext(N, DAG, DCI, Subtarget);
@@ -31534,6 +31552,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case X86ISD::VPERMILPI:
   case X86ISD::VPERMILPV:
   case X86ISD::VPERM2X128:
+  case X86ISD::VZEXT_MOVL:
   case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
   case ISD::FMA:            return combineFMA(N, DAG, Subtarget);
   case ISD::MGATHER:
diff --git a/llvm/test/CodeGen/X86/insertps-combine.ll b/llvm/test/CodeGen/X86/insertps-combine.ll
index b55a029ea65..e801334ec71 100644
--- a/llvm/test/CodeGen/X86/insertps-combine.ll
+++ b/llvm/test/CodeGen/X86/insertps-combine.ll
@@ -112,14 +112,12 @@ define <4 x float> @shuffle_v4f32_z06z(<4 x float> %a, <4 x float> %b) {
 define <4 x float> @shuffle_v4f32_05zz(<4 x float> %a, <4 x float> %b) {
 ; SSE-LABEL: shuffle_v4f32_05zz:
 ; SSE:       # BB#0:
-; SSE-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; SSE-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
+; SSE-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[1],zero,zero
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v4f32_05zz:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[1],zero,zero
 ; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 undef, i32 undef>
   %shuffle1 = shufflevector <4 x float> %shuffle, <4 x float> <float undef, float undef, float 0.000000e+00, float 0.000000e+00>, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
diff --git a/llvm/test/CodeGen/X86/vec_ss_load_fold.ll b/llvm/test/CodeGen/X86/vec_ss_load_fold.ll
index bdc490f461c..cec0402c464 100644
--- a/llvm/test/CodeGen/X86/vec_ss_load_fold.ll
+++ b/llvm/test/CodeGen/X86/vec_ss_load_fold.ll
@@ -9,7 +9,8 @@ define i16 @test1(float %f) nounwind {
 ; X32-NEXT:    xorps %xmm1, %xmm1
 ; X32-NEXT:    subss LCPI0_0, %xmm0
 ; X32-NEXT:    mulss LCPI0_1, %xmm0
-; X32-NEXT:    minss LCPI0_2, %xmm0
+; X32-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-NEXT:    minss %xmm2, %xmm0
 ; X32-NEXT:    maxss %xmm1, %xmm0
 ; X32-NEXT:    cvttss2si %xmm0, %eax
 ; X32-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
@@ -21,7 +22,8 @@ define i16 @test1(float %f) nounwind {
 ; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; X64-NEXT:    subss {{.*}}(%rip), %xmm0
 ; X64-NEXT:    mulss {{.*}}(%rip), %xmm0
-; X64-NEXT:    minss {{.*}}(%rip), %xmm0
+; X64-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X64-NEXT:    minss %xmm2, %xmm0
 ; X64-NEXT:    maxss %xmm1, %xmm0
 ; X64-NEXT:    cvttss2si %xmm0, %eax
 ; X64-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
index e64ca967eaa..f53d30a1700 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
@@ -2176,29 +2176,18 @@ define <8 x i16> @mask_v8i16_012345ef(<8 x i16> %a, <8 x i16> %b) {
 }
 
 define <8 x i16> @insert_dup_mem_v8i16_i32(i32* %ptr) {
-; SSE2-LABEL: insert_dup_mem_v8i16_i32:
-; SSE2:       # BB#0:
-; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; SSE2-NEXT:    retq
-;
-; SSSE3-LABEL: insert_dup_mem_v8i16_i32:
-; SSSE3:       # BB#0:
-; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; SSSE3-NEXT:    retq
-;
-; SSE41-LABEL: insert_dup_mem_v8i16_i32:
-; SSE41:       # BB#0:
-; SSE41-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; SSE41-NEXT:    retq
+; SSE-LABEL: insert_dup_mem_v8i16_i32:
+; SSE:       # BB#0:
+; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: insert_dup_mem_v8i16_i32:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: insert_dup_mem_v8i16_i32:
@@ -2257,29 +2246,18 @@ define <8 x i16> @insert_dup_mem_v8i16_sext_i16(i16* %ptr) {
 }
 
 define <8 x i16> @insert_dup_elt1_mem_v8i16_i32(i32* %ptr) {
-; SSE2-LABEL: insert_dup_elt1_mem_v8i16_i32:
-; SSE2:       # BB#0:
-; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; SSE2-NEXT:    retq
-;
-; SSSE3-LABEL: insert_dup_elt1_mem_v8i16_i32:
-; SSSE3:       # BB#0:
-; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
-; SSSE3-NEXT:    retq
-;
-; SSE41-LABEL: insert_dup_elt1_mem_v8i16_i32:
-; SSE41:       # BB#0:
-; SSE41-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
-; SSE41-NEXT:    retq
+; SSE-LABEL: insert_dup_elt1_mem_v8i16_i32:
+; SSE:       # BB#0:
+; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: insert_dup_elt1_mem_v8i16_i32:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: insert_dup_elt1_mem_v8i16_i32:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
index c7d9ca86102..29715e07a81 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -3564,7 +3564,9 @@ define <16 x i16> @insert_dup_elt3_mem_v16i16_i32(i32* %ptr) #0 {
 ; AVX1-LABEL: insert_dup_elt3_mem_v16i16_i32:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
index 6e8fc5ef194..12ce9a2b1df 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -2814,31 +2814,12 @@ define <4 x float> @combine_insertps4(<4 x float> %a, <4 x float> %b) {
   ret <4 x float> %d
 }
 
-; FIXME: Failed to recognise that the VMOVSD has already zero'd the upper element
 define void @combine_scalar_load_with_blend_with_zero(double* %a0, <4 x float>* %a1) {
-; SSE2-LABEL: combine_scalar_load_with_blend_with_zero:
-; SSE2:       # BB#0:
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT:    xorps %xmm1, %xmm1
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
-; SSE2-NEXT:    movaps %xmm0, (%rsi)
-; SSE2-NEXT:    retq
-;
-; SSSE3-LABEL: combine_scalar_load_with_blend_with_zero:
-; SSSE3:       # BB#0:
-; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSSE3-NEXT:    xorps %xmm1, %xmm1
-; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
-; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
-; SSSE3-NEXT:    movaps %xmm0, (%rsi)
-; SSSE3-NEXT:    retq
-;
-; SSE41-LABEL: combine_scalar_load_with_blend_with_zero:
-; SSE41:       # BB#0:
-; SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE41-NEXT:    movapd %xmm0, (%rsi)
-; SSE41-NEXT:    retq
+; SSE-LABEL: combine_scalar_load_with_blend_with_zero:
+; SSE:       # BB#0:
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT:    movapd %xmm0, (%rsi)
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_scalar_load_with_blend_with_zero:
 ; AVX:       # BB#0:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll b/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll
index e8d9aa20491..bd59328aaf8 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll
@@ -41,7 +41,8 @@ define <4 x double> @var_shuffle_v4f64_v4f64_uxx0_i64(<4 x double> %x, i64 %i0,
 ; ALL-NEXT:    andq $-32, %rsp
 ; ALL-NEXT:    subq $64, %rsp
 ; ALL-NEXT:    vmovaps %ymm0, (%rsp)
-; ALL-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
+; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; ALL-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 ; ALL-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
 ; ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; ALL-NEXT:    movq %rbp, %rsp
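
To make the new guard in combineX86ShuffleChain above easier to follow, here is a self-contained C++ sketch of the mask test only (not the LLVM implementation; the sentinel encoding and helper names merely mirror it, and the std::vector-based interface is invented for illustration). A shuffle of a zero-extending load is redundant when the low Scale lanes stay in place and every upper lane reads either an undef/zero sentinel or one of the already-zero upper lanes.

#include <cstdio>
#include <vector>

constexpr int SentinelUndef = -1; // lane value is a "don't care"
constexpr int SentinelZero = -2;  // lane is known to be zero

// Mirrors the isUndefOrZeroOrInRange helper added above.
static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
  return Val == SentinelUndef || Val == SentinelZero ||
         (Val >= Low && Val < Hi);
}

// Mirrors isSequentialOrUndefInRange(Mask, 0, Scale, 0): the low lanes must
// keep their positions (or be undef).
static bool lowLanesInPlace(const std::vector<int> &Mask, int Scale) {
  for (int i = 0; i < Scale; ++i)
    if (Mask[i] != SentinelUndef && Mask[i] != i)
      return false;
  return true;
}

// Returns true if shuffling a VZEXT_LOAD-style value (low Scale lanes loaded,
// upper lanes known zero) with Mask cannot change the value.
static bool shuffleOfZextLoadIsRedundant(const std::vector<int> &Mask,
                                         int Scale) {
  const int NumElts = static_cast<int>(Mask.size());
  if (!lowLanesInPlace(Mask, Scale))
    return false;
  for (int i = Scale; i < NumElts; ++i)
    if (!isUndefOrZeroOrInRange(Mask[i], Scale, NumElts))
      return false;
  return true;
}

int main() {
  // Model a movq-style load into v4f32: lanes 0-1 loaded, lanes 2-3 zero.
  const std::vector<int> Redundant = {0, 1, 3, SentinelZero}; // only moves zeros
  const std::vector<int> NotRedundant = {1, 0, 3, 2};         // swaps the low lanes
  std::printf("%d %d\n", shuffleOfZextLoadIsRedundant(Redundant, 2),
              shuffleOfZextLoadIsRedundant(NotRedundant, 2)); // prints "1 0"
  return 0;
}

With the first mask, the loaded lanes stay put and the upper lanes only rearrange lanes the load already zeroed, so the combine would replace the whole shuffle with a bitcast of the load; the second mask swaps the loaded lanes, so the fold does not apply.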