diff options
Diffstat (limited to 'llvm/lib/Target/X86')
| -rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 168 | ||||
| -rw-r--r-- | llvm/lib/Target/X86/X86InstrSSE.td | 8 | ||||
| -rw-r--r-- | llvm/lib/Target/X86/X86IntrinsicsInfo.h | 4 |
3 files changed, 176 insertions, 4 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 22935c31aba..e06e6fbcf52 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -1779,6 +1779,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setTargetDAGCombine(ISD::MLOAD); setTargetDAGCombine(ISD::STORE); setTargetDAGCombine(ISD::MSTORE); + setTargetDAGCombine(ISD::TRUNCATE); setTargetDAGCombine(ISD::ZERO_EXTEND); setTargetDAGCombine(ISD::ANY_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND); @@ -19853,6 +19854,36 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, switch (N->getOpcode()) { default: llvm_unreachable("Do not know how to custom type legalize this operation!"); + case X86ISD::AVG: { + // Legalize types for X86ISD::AVG by expanding vectors. + assert(Subtarget->hasSSE2() && "Requires at least SSE2!"); + + auto InVT = N->getValueType(0); + auto InVTSize = InVT.getSizeInBits(); + const unsigned RegSize = + (InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128; + assert((!Subtarget->hasAVX512() || RegSize < 512) && + "512-bit vector requires AVX512"); + assert((!Subtarget->hasAVX2() || RegSize < 256) && + "256-bit vector requires AVX2"); + + auto ElemVT = InVT.getVectorElementType(); + auto RegVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, + RegSize / ElemVT.getSizeInBits()); + assert(RegSize % InVT.getSizeInBits() == 0); + unsigned NumConcat = RegSize / InVT.getSizeInBits(); + + SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT)); + Ops[0] = N->getOperand(0); + SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops); + Ops[0] = N->getOperand(1); + SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops); + + SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1); + Results.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res, + DAG.getIntPtrConstant(0, dl))); + return; + } // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32. case X86ISD::FMINC: case X86ISD::FMIN: @@ -25347,6 +25378,132 @@ static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +/// This function detects the AVG pattern between vectors of unsigned i8/i16, +/// which is c = (a + b + 1) / 2, and replace this operation with the efficient +/// X86ISD::AVG instruction. +static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG, + const X86Subtarget *Subtarget, SDLoc DL) { + if (!VT.isVector() || !VT.isSimple()) + return SDValue(); + EVT InVT = In.getValueType(); + unsigned NumElems = VT.getVectorNumElements(); + + EVT ScalarVT = VT.getVectorElementType(); + if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) && + isPowerOf2_32(NumElems))) + return SDValue(); + + // InScalarVT is the intermediate type in AVG pattern and it should be greater + // than the original input type (i8/i16). + EVT InScalarVT = InVT.getVectorElementType(); + if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits()) + return SDValue(); + + if (Subtarget->hasAVX512()) { + if (VT.getSizeInBits() > 512) + return SDValue(); + } else if (Subtarget->hasAVX2()) { + if (VT.getSizeInBits() > 256) + return SDValue(); + } else { + if (VT.getSizeInBits() > 128) + return SDValue(); + } + + // Detect the following pattern: + // + // %1 = zext <N x i8> %a to <N x i32> + // %2 = zext <N x i8> %b to <N x i32> + // %3 = add nuw nsw <N x i32> %1, <i32 1 x N> + // %4 = add nuw nsw <N x i32> %3, %2 + // %5 = lshr <N x i32> %N, <i32 1 x N> + // %6 = trunc <N x i32> %5 to <N x i8> + // + // In AVX512, the last instruction can also be a trunc store. + + if (In.getOpcode() != ISD::SRL) + return SDValue(); + + // A lambda checking the given SDValue is a constant vector and each element + // is in the range [Min, Max]. + auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) { + BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V); + if (!BV || !BV->isConstant()) + return false; + for (unsigned i = 0, e = V.getNumOperands(); i < e; i++) { + ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(i)); + if (!C) + return false; + uint64_t Val = C->getZExtValue(); + if (Val < Min || Val > Max) + return false; + } + return true; + }; + + // Check if each element of the vector is left-shifted by one. + auto LHS = In.getOperand(0); + auto RHS = In.getOperand(1); + if (!IsConstVectorInRange(RHS, 1, 1)) + return SDValue(); + if (LHS.getOpcode() != ISD::ADD) + return SDValue(); + + // Detect a pattern of a + b + 1 where the order doesn't matter. + SDValue Operands[3]; + Operands[0] = LHS.getOperand(0); + Operands[1] = LHS.getOperand(1); + + // Take care of the case when one of the operands is a constant vector whose + // element is in the range [1, 256]. + if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) && + Operands[0].getOpcode() == ISD::ZERO_EXTEND && + Operands[0].getOperand(0).getValueType() == VT) { + // The pattern is detected. Subtract one from the constant vector, then + // demote it and emit X86ISD::AVG instruction. + SDValue One = DAG.getConstant(1, DL, InScalarVT); + SDValue Ones = DAG.getNode(ISD::BUILD_VECTOR, DL, InVT, + SmallVector<SDValue, 8>(NumElems, One)); + Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], Ones); + Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]); + return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0), + Operands[1]); + } + + if (Operands[0].getOpcode() == ISD::ADD) + std::swap(Operands[0], Operands[1]); + else if (Operands[1].getOpcode() != ISD::ADD) + return SDValue(); + Operands[2] = Operands[1].getOperand(0); + Operands[1] = Operands[1].getOperand(1); + + // Now we have three operands of two additions. Check that one of them is a + // constant vector with ones, and the other two are promoted from i8/i16. + for (int i = 0; i < 3; ++i) { + if (!IsConstVectorInRange(Operands[i], 1, 1)) + continue; + std::swap(Operands[i], Operands[2]); + + // Check if Operands[0] and Operands[1] are results of type promotion. + for (int j = 0; j < 2; ++j) + if (Operands[j].getOpcode() != ISD::ZERO_EXTEND || + Operands[j].getOperand(0).getValueType() != VT) + return SDValue(); + + // The pattern is detected, emit X86ISD::AVG instruction. + return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0), + Operands[1].getOperand(0)); + } + + return SDValue(); +} + +static SDValue PerformTRUNCATECombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + return detectAVGPattern(N->getOperand(0), N->getValueType(0), DAG, Subtarget, + SDLoc(N)); +} + /// PerformLOADCombine - Do target-specific dag combines on LOAD nodes. static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, @@ -25611,6 +25768,16 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, // First, pack all of the elements in one place. Next, store to memory // in fewer chunks. if (St->isTruncatingStore() && VT.isVector()) { + // Check if we can detect an AVG pattern from the truncation. If yes, + // replace the trunc store by a normal store with the result of X86ISD::AVG + // instruction. + SDValue Avg = + detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG, Subtarget, dl); + if (Avg.getNode()) + return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(), + St->getPointerInfo(), St->isVolatile(), + St->isNonTemporal(), St->getAlignment()); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned NumElems = VT.getVectorNumElements(); assert(StVT != VT && "Cannot truncate to the same type"); @@ -26873,6 +27040,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::UINT_TO_FP: return PerformUINT_TO_FPCombine(N, DAG, Subtarget); case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget); case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget); + case ISD::TRUNCATE: return PerformTRUNCATECombine(N, DAG, Subtarget); case X86ISD::FXOR: case X86ISD::FOR: return PerformFORCombine(N, DAG, Subtarget); case X86ISD::FMIN: diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index fd829e1b189..dd3ab69fc05 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -4046,6 +4046,10 @@ defm PMAXUB : PDI_binop_all<0xDE, "pmaxub", umax, v16i8, v32i8, SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; defm PMAXSW : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16, SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; +defm PAVGB : PDI_binop_all<0xE0, "pavgb", X86avg, v16i8, v32i8, + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; +defm PAVGW : PDI_binop_all<0xE3, "pavgw", X86avg, v8i16, v16i16, + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; // Intrinsic forms defm PSUBSB : PDI_binop_all_int<0xE8, "psubsb", int_x86_sse2_psubs_b, @@ -4062,10 +4066,6 @@ defm PADDUSW : PDI_binop_all_int<0xDD, "paddusw", int_x86_sse2_paddus_w, int_x86_avx2_paddus_w, SSE_INTALU_ITINS_P, 1>; defm PMADDWD : PDI_binop_all_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd, int_x86_avx2_pmadd_wd, SSE_PMADD, 1>; -defm PAVGB : PDI_binop_all_int<0xE0, "pavgb", int_x86_sse2_pavg_b, - int_x86_avx2_pavg_b, SSE_INTALU_ITINS_P, 1>; -defm PAVGW : PDI_binop_all_int<0xE3, "pavgw", int_x86_sse2_pavg_w, - int_x86_avx2_pavg_w, SSE_INTALU_ITINS_P, 1>; defm PSADBW : PDI_binop_all_int<0xF6, "psadbw", int_x86_sse2_psad_bw, int_x86_avx2_psad_bw, SSE_PMADD, 1>; diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h index 4bdb5b9146e..0b77d480bc0 100644 --- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h +++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h @@ -250,6 +250,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx2_packsswb, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(avx2_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0), X86_INTRINSIC_DATA(avx2_packuswb, INTR_TYPE_2OP, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(avx2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(avx2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0), X86_INTRINSIC_DATA(avx2_phadd_d, INTR_TYPE_2OP, X86ISD::HADD, 0), X86_INTRINSIC_DATA(avx2_phadd_w, INTR_TYPE_2OP, X86ISD::HADD, 0), X86_INTRINSIC_DATA(avx2_phsub_d, INTR_TYPE_2OP, X86ISD::HSUB, 0), @@ -1699,6 +1701,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(sse2_packssdw_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(sse2_packsswb_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(sse2_packuswb_128, INTR_TYPE_2OP, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(sse2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(sse2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0), X86_INTRINSIC_DATA(sse2_pmaxs_w, INTR_TYPE_2OP, ISD::SMAX, 0), X86_INTRINSIC_DATA(sse2_pmaxu_b, INTR_TYPE_2OP, ISD::UMAX, 0), X86_INTRINSIC_DATA(sse2_pmins_w, INTR_TYPE_2OP, ISD::SMIN, 0), |

