From ec13ebf2c83118f83ca4f35fd938c0a5bcfdd38c Mon Sep 17 00:00:00 2001
From: Sanjay Patel
Date: Fri, 26 May 2017 15:33:18 +0000
Subject: [DAGCombiner] use narrow vector ops to eliminate concat/extract (PR32790)

In the best case:

  extract (binop (concat X1, X2), (concat Y1, Y2)), N --> binop XN, YN

...we kill all of the extract/concat and just have narrow binops remaining.

If only one of the binop operands is amenable, this transform is still
worthwhile because we kill some of the extract/concat.
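To sanity-check the identity behind that best case, here is a minimal
standalone sketch (plain C++ with toy 4-wide "vectors"; '&' stands in for any
bitwise binop, and every name is illustrative rather than LLVM code):

  #include <array>
  #include <cassert>

  using V4 = std::array<unsigned, 4>;
  using V8 = std::array<unsigned, 8>;

  static V8 concat(const V4 &Lo, const V4 &Hi) {
    return {Lo[0], Lo[1], Lo[2], Lo[3], Hi[0], Hi[1], Hi[2], Hi[3]};
  }

  static V4 extractSubvector(const V8 &V, unsigned Idx) { // Idx is 0 or 4
    return {V[Idx], V[Idx + 1], V[Idx + 2], V[Idx + 3]};
  }

  template <typename T> static T binop(const T &A, const T &B) {
    T R{}; // lanewise AND as a stand-in for AND/OR/XOR
    for (unsigned i = 0; i != R.size(); ++i)
      R[i] = A[i] & B[i];
    return R;
  }

  int main() {
    V4 X1{1, 2, 3, 4}, X2{5, 6, 7, 8}, Y1{9, 10, 11, 12}, Y2{13, 14, 15, 16};
    // extract (binop (concat X1, X2), (concat Y1, Y2)), 4 --> binop X2, Y2
    V4 Wide = extractSubvector(binop(concat(X1, X2), concat(Y1, Y2)), 4);
    assert(Wide == binop(X2, Y2) && "narrow binop computes the same lanes");
    return 0;
  }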
Optional bitcasting makes the code more complicated, but there doesn't seem to
be a way to avoid that.

The TODO about extending to more than bitwise logic is there because we really
will regress several x86 tests, including madd, psad, and even a plain
integer-multiply-by-2 or shift-left-by-1. I don't think there's anything
fundamentally wrong with this patch that would cause those regressions; those
folds are just missing or brittle.

If we extend to more binops, I found that this patch will fire on at least one
non-x86 regression test. There's an ARM NEON test in
test/CodeGen/ARM/coalesce-subregs.ll with a pattern like:

          t5: v2f32 = vector_shuffle<0,3> t2, t4
        t6: v1i64 = bitcast t5
        t8: v1i64 = BUILD_VECTOR Constant:i64<0>
      t9: v2i64 = concat_vectors t6, t8
    t10: v4f32 = bitcast t9
  t12: v4f32 = fmul t11, t10
  t13: v2i64 = bitcast t12
  t16: v1i64 = extract_subvector t13, Constant:i32<0>

From what I could see, though, there was no functional change in the codegen
from this transform.

For the x86 test changes:

1. PR32790() is the closest call. We don't reduce the AVX1 instruction count
   in that case, but we improve throughput. Also, on a core like Jaguar that
   double-pumps 256-bit ops, there's an unseen win because two 128-bit ops
   have the same cost as the wider 256-bit op. SSE/AVX2/AVX512 are not
   affected, which is expected because only AVX1 has the extract/concat ops
   needed to match the pattern.

2. do_not_use_256bit_op() is the best case: everyone wins by avoiding the
   concat/extract. A related bug for IR is filed as:
   https://bugs.llvm.org/show_bug.cgi?id=33026

3. The SSE diffs in vector-trunc-math.ll are just scheduling/RA, so nothing
   real AFAICT.

4. The AVX1 diffs in vector-tzcnt-256.ll are all the same pattern: we reduced
   the instruction count by one in each case by eliminating two insert/extract
   ops while adding one narrower logic op.
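One step in the patch below that may not be obvious is how the constant extract
index selects a concat operand (ConcatOpNum) and, for a non-concat operand, a
fallback extract index (ExtBOIdx). A minimal sketch of just that arithmetic,
assuming the simple case where the narrow binop type has the same element count
as the extracted type (illustrative, not LLVM code):

  #include <cassert>
  #include <cstdio>

  // The wide value is a 2-operand concat, so an extract index that is a
  // multiple of the narrow element count picks out exactly one concat operand.
  static unsigned concatOpNum(unsigned ExtractIndex, unsigned NumElems) {
    assert(ExtractIndex % NumElems == 0 && "index must align to the narrow VT");
    return ExtractIndex / NumElems; // 0 -> low half, 1 -> high half
  }

  int main() {
    // Extracting v4i32 at index 4 from a v8i32 binop selects concat operand 1;
    // the fallback extract of a non-concat operand would use index 1 * 4 = 4.
    unsigned OpNum = concatOpNum(4, 4);
    printf("concat operand %u, narrow extract index %u\n", OpNum, OpNum * 4);
    return 0;
  }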
https://bugs.llvm.org/show_bug.cgi?id=32790

Differential Revision: https://reviews.llvm.org/D33137

llvm-svn: 303997
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 96 +++++++++++++++++++++++++++
 1 file changed, 96 insertions(+)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 8d247c89a0a..6b9ab714e67 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -14462,6 +14462,99 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
   return SDValue();
 }
 
+/// If we are extracting a subvector produced by a wide binary operator with at
+/// least one operand that was the result of a vector concatenation, then try
+/// to use the narrow vector operands directly to avoid the concatenation and
+/// extraction.
+static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG) {
+  // TODO: Refactor with the caller (visitEXTRACT_SUBVECTOR), so we can share
+  // some of these bailouts with other transforms.
+
+  // The extract index must be a constant, so we can map it to a concat operand.
+  auto *ExtractIndex = dyn_cast<ConstantSDNode>(Extract->getOperand(1));
+  if (!ExtractIndex)
+    return SDValue();
+
+  // Only handle the case where we are doubling and then halving. A larger ratio
+  // may require more than two narrow binops to replace the wide binop.
+  EVT VT = Extract->getValueType(0);
+  unsigned NumElems = VT.getVectorNumElements();
+  assert((ExtractIndex->getZExtValue() % NumElems) == 0 &&
+         "Extract index is not a multiple of the vector length.");
+  if (Extract->getOperand(0).getValueSizeInBits() != VT.getSizeInBits() * 2)
+    return SDValue();
+
+  // We are looking for an optionally bitcasted wide vector binary operator
+  // feeding an extract subvector.
+  SDValue BinOp = Extract->getOperand(0);
+  if (BinOp.getOpcode() == ISD::BITCAST)
+    BinOp = BinOp.getOperand(0);
+
+  // TODO: The motivating case for this transform is an x86 AVX1 target. That
+  // target has temptingly almost legal versions of bitwise logic ops in 256-bit
+  // flavors, but no other 256-bit integer support. This could be extended to
+  // handle any binop, but that may require fixing/adding other folds to avoid
+  // codegen regressions.
+  unsigned BOpcode = BinOp.getOpcode();
+  if (BOpcode != ISD::AND && BOpcode != ISD::OR && BOpcode != ISD::XOR)
+    return SDValue();
+
+  // The binop must be a vector type, so we can chop it in half.
+  EVT WideBVT = BinOp.getValueType();
+  if (!WideBVT.isVector())
+    return SDValue();
+
+  // Bail out if the target does not support a narrower version of the binop.
+  EVT NarrowBVT = EVT::getVectorVT(*DAG.getContext(), WideBVT.getScalarType(),
+                                   WideBVT.getVectorNumElements() / 2);
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (!TLI.isOperationLegalOrCustomOrPromote(BOpcode, NarrowBVT))
+    return SDValue();
+
+  // Peek through bitcasts of the binary operator operands if needed.
+  SDValue LHS = BinOp.getOperand(0);
+  if (LHS.getOpcode() == ISD::BITCAST)
+    LHS = LHS.getOperand(0);
+
+  SDValue RHS = BinOp.getOperand(1);
+  if (RHS.getOpcode() == ISD::BITCAST)
+    RHS = RHS.getOperand(0);
+
+  // We need at least one concatenation operation of a binop operand to make
+  // this transform worthwhile. The concat must double the input vector sizes.
+  // TODO: Should we also handle INSERT_SUBVECTOR patterns?
+  bool ConcatL =
+      LHS.getOpcode() == ISD::CONCAT_VECTORS && LHS.getNumOperands() == 2;
+  bool ConcatR =
+      RHS.getOpcode() == ISD::CONCAT_VECTORS && RHS.getNumOperands() == 2;
+  if (!ConcatL && !ConcatR)
+    return SDValue();
+
+  // If one of the binop operands was not the result of a concat, we must
+  // extract a half-sized operand for our new narrow binop. We can't just reuse
+  // the original extract index operand because we may have bitcasted.
+  unsigned ConcatOpNum = ExtractIndex->getZExtValue() / NumElems;
+  unsigned ExtBOIdx = ConcatOpNum * NarrowBVT.getVectorNumElements();
+  EVT ExtBOIdxVT = Extract->getOperand(1).getValueType();
+  SDLoc DL(Extract);
+
+  // extract (binop (concat X1, X2), (concat Y1, Y2)), N --> binop XN, YN
+  // extract (binop (concat X1, X2), Y), N --> binop XN, (extract Y, N)
+  // extract (binop X, (concat Y1, Y2)), N --> binop (extract X, N), YN
+  SDValue X = ConcatL ? DAG.getBitcast(NarrowBVT, LHS.getOperand(ConcatOpNum))
+                      : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT,
+                                    BinOp.getOperand(0),
+                                    DAG.getConstant(ExtBOIdx, DL, ExtBOIdxVT));
+
+  SDValue Y = ConcatR ? DAG.getBitcast(NarrowBVT, RHS.getOperand(ConcatOpNum))
+                      : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT,
+                                    BinOp.getOperand(1),
+                                    DAG.getConstant(ExtBOIdx, DL, ExtBOIdxVT));
+
+  SDValue NarrowBinOp = DAG.getNode(BOpcode, DL, NarrowBVT, X, Y);
+  return DAG.getBitcast(VT, NarrowBinOp);
+}
+
 SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode* N) {
   EVT NVT = N->getValueType(0);
   SDValue V = N->getOperand(0);
@@ -14517,6 +14610,9 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode* N) {
     }
   }
 
+  if (SDValue NarrowBOp = narrowExtractedVectorBinOp(N, DAG))
+    return NarrowBOp;
+
   return SDValue();
 }
 
-- 
cgit v1.2.3