| author | Sanjay Patel <spatel@rotateright.com> | 2018-09-25 19:09:34 +0000 |
|---|---|---|
| committer | Sanjay Patel <spatel@rotateright.com> | 2018-09-25 19:09:34 +0000 |
| commit | 10c11b867a045ee6f5ada9b741dbf2929d8d7ae0 (patch) | |
| tree | 153d88a3398b84c933d3581a439842224b30d0c1 | |
| parent | 7c18d6083ab5e0ba0c9b720be5f4b7fc80b9c958 (diff) | |
[x86] avoid 256-bit andnp that requires insert/extract with AVX1 (PR37749)
This is the final (I hope!) problem pattern mentioned in PR37749:
https://bugs.llvm.org/show_bug.cgi?id=37749
We are trying to avoid an AVX1 sinkhole caused by having 256-bit bitwise logic ops but no other 256-bit integer ops.
We've already solved the simple logic ops, but 'andn' is an x86 special. I looked at alternative solutions like
extending the generic DAG combine or trying to wait until the ANDNP node is created, but those are bigger patches
that can over-reach: splitting to 128-bit does not look like a win in most cases with more than one 256-bit op.
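To make the trade-off concrete, here is a rough C++ intrinsics sketch (my illustration, not code from the patch; the function name is made up) of the shape we want on AVX1: the integer adds around the 'andn' must run as 128-bit halves anyway, so doing the 'andn' per half with vpandn avoids rebuilding and re-splitting a ymm value just to use vandnps. Compile with -mavx.

```cpp
#include <immintrin.h>

// Hypothetical AVX1 sketch of ~(x + y) & mask on packed i32.
// AVX1 has no 256-bit integer add, so the add runs as two 128-bit halves;
// keeping the 'andn' at 256 bits would need vinsertf128 + vandnps +
// vextractf128, while per-half vpandn stays in xmm registers throughout.
__m256i andn_after_add_avx1(__m256i x, __m256i y, __m256i mask) {
  __m128i lo = _mm_add_epi32(_mm256_castsi256_si128(x),
                             _mm256_castsi256_si128(y));
  __m128i hi = _mm_add_epi32(_mm256_extractf128_si256(x, 1),
                             _mm256_extractf128_si256(y, 1));
  // _mm_andnot_si128(a, b) computes (~a) & b, i.e. x86 ANDN semantics.
  lo = _mm_andnot_si128(lo, _mm256_castsi256_si128(mask));
  hi = _mm_andnot_si128(hi, _mm256_extractf128_si256(mask, 1));
  return _mm256_set_m128i(hi, lo);
}
```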
The pattern matching is cluttered with bitcasts because of our i64 element canonicalization. For the affected test,
we have this vector-type-legalized sequence:
t29: v8i32 = concat_vectors t27, t28
t30: v4i64 = bitcast t29
t18: v8i32 = BUILD_VECTOR Constant:i32<-1>, Constant:i32<-1>, ...
t31: v4i64 = bitcast t18
t32: v4i64 = xor t30, t31
t9: v8i32 = BUILD_VECTOR Constant:i32<255>, Constant:i32<255>, ...
t34: v4i64 = bitcast t9
t35: v4i64 = and t32, t34
t36: v8i32 = bitcast t35
t37: v4i32 = extract_subvector t36, Constant:i64<0>
t38: v4i32 = extract_subvector t36, Constant:i64<4>
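Note that the constants (t18, t9) are v8i32 BUILD_VECTORs wrapped in v4i64 bitcasts (t31, t34), so a matcher looking at the v4i64 'xor' and 'and' operands sees BITCAST nodes first; that is why the patch below routes isBitwiseNot's operand through peekThroughBitcasts. As a reading aid, a minimal sketch of that helper's behavior (simplified from LLVM's SelectionDAG utilities; assumes the LLVM headers are available):

```cpp
#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

// Simplified model of peekThroughBitcasts: walk down through BITCAST
// nodes so that, e.g., the v8i32 all-ones BUILD_VECTOR (t18) becomes
// visible from the v4i64 xor operand (t31) in the dump above.
static SDValue peekThroughBitcastsSketch(SDValue V) {
  while (V.getOpcode() == ISD::BITCAST)
    V = V.getOperand(0);
  return V;
}
```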
Differential Revision: https://reviews.llvm.org/D52318
llvm-svn: 343008
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp |  2
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp        | 31
-rw-r--r--  llvm/test/CodeGen/X86/avx-logic.ll             | 18
3 files changed, 44 insertions, 7 deletions
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 0a3744ebdcc..00c6c446d27 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -8205,7 +8205,7 @@ SDValue llvm::peekThroughOneUseBitcasts(SDValue V) {
 bool llvm::isBitwiseNot(SDValue V) {
   if (V.getOpcode() != ISD::XOR)
     return false;
-  ConstantSDNode *C = isConstOrConstSplat(V.getOperand(1));
+  ConstantSDNode *C = isConstOrConstSplat(peekThroughBitcasts(V.getOperand(1)));
   return C && C->isAllOnesValue();
 }
 
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 54e9098ea54..7fc37718409 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -40168,6 +40168,37 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
 static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
                                        TargetLowering::DAGCombinerInfo &DCI,
                                        const X86Subtarget &Subtarget) {
+  // For AVX1 only, if we are extracting from a 256-bit and+not (which will
+  // eventually get combined/lowered into ANDNP) with a concatenated operand,
+  // split the 'and' into 128-bit ops to avoid the concatenate and extract.
+  // We let generic combining take over from there to simplify the
+  // insert/extract and 'not'.
+  // This pattern emerges during AVX1 legalization. We handle it before
+  // lowering to avoid complications like splitting constant vector loads.
+
+  // Capture the original wide type in the likely case that we need to bitcast
+  // back to this type.
+  EVT VT = N->getValueType(0);
+  EVT WideVecVT = N->getOperand(0).getValueType();
+  SDValue WideVec = peekThroughBitcasts(N->getOperand(0));
+  if (Subtarget.hasAVX() && !Subtarget.hasAVX2() && WideVecVT.isSimple() &&
+      WideVecVT.getSizeInBits() == 256 && WideVec.getOpcode() == ISD::AND) {
+    auto isConcatenatedNot = [] (SDValue V) {
+      V = peekThroughBitcasts(V);
+      if (!isBitwiseNot(V))
+        return false;
+      SDValue NotOp = V->getOperand(0);
+      return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS;
+    };
+    if (isConcatenatedNot(WideVec.getOperand(0)) ||
+        isConcatenatedNot(WideVec.getOperand(1))) {
+      // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
+      SDValue Concat = split256IntArith(WideVec, DAG);
+      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT,
+                         DAG.getBitcast(WideVecVT, Concat), N->getOperand(1));
+    }
+  }
+
   if (DCI.isBeforeLegalizeOps())
     return SDValue();
 
diff --git a/llvm/test/CodeGen/X86/avx-logic.ll b/llvm/test/CodeGen/X86/avx-logic.ll
index 379a9751c2f..0fe5cbacc84 100644
--- a/llvm/test/CodeGen/X86/avx-logic.ll
+++ b/llvm/test/CodeGen/X86/avx-logic.ll
@@ -342,9 +342,9 @@ define <8 x i32> @andn_disguised_i8_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> %
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
 ; AVX1-NEXT:    vpaddd %xmm3, %xmm4, %xmm3
 ; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; AVX1-NEXT:    vandnps {{.*}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1095216660735,1095216660735]
+; AVX1-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpandn %xmm1, %xmm3, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
 ; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
@@ -364,6 +364,8 @@ define <8 x i32> @andn_disguised_i8_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> %
   ret <8 x i32> %add1
 }
 
+; Negative test - if we don't have a leading concat_vectors, the transform won't be profitable.
+
 define <8 x i32> @andn_variable_mask_operand_no_concat(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z) {
 ; AVX1-LABEL: andn_variable_mask_operand_no_concat:
 ; AVX1:       # %bb.0:
@@ -386,6 +388,8 @@ define <8 x i32> @andn_variable_mask_operand_no_concat(<8 x i32> %
   ret <8 x i32> %add
 }
 
+; Negative test - if we don't have a leading concat_vectors, the transform won't be profitable (even if the mask is a constant).
+
 define <8 x i32> @andn_constant_mask_operand_no_concat(<8 x i32> %x, <8 x i32> %y) {
 ; AVX1-LABEL: andn_constant_mask_operand_no_concat:
 ; AVX1:       # %bb.0:
@@ -408,6 +412,8 @@ define <8 x i32> @andn_constant_mask_operand_no_concat(<8 x i32> %
   ret <8 x i32> %r
 }
 
+; This is a close call, but we split the 'andn' to reduce the insert/extract.
+
 define <8 x i32> @andn_variable_mask_operand_concat(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z, <8 x i32> %w) {
 ; AVX1-LABEL: andn_variable_mask_operand_concat:
 ; AVX1:       # %bb.0:
@@ -415,9 +421,9 @@ define <8 x i32> @andn_variable_mask_operand_concat(<8 x i32> %x, <8 x i32> %y,
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
 ; AVX1-NEXT:    vpaddd %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX1-NEXT:    vandnps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpandn %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm1
+; AVX1-NEXT:    vpandn %xmm1, %xmm4, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
 ; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpaddd %xmm3, %xmm0, %xmm0