Diffstat (limited to 'llvm/lib')
 llvm/lib/Target/X86/X86ISelLowering.cpp | 39
 llvm/lib/Target/X86/X86InstrAVX512.td   | 56
2 files changed, 93 insertions, 2 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 71bab13c427..a47a6669849 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -41144,6 +41144,41 @@ static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
+                                     TargetLowering::DAGCombinerInfo &DCI) {
+  EVT VT = N->getValueType(0);
+
+  // Convert a full vector load into vzload when not all bits are needed.
+  SDValue In = N->getOperand(0);
+  MVT InVT = In.getSimpleValueType();
+  if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
+      ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
+    assert(InVT.is128BitVector() && "Expected 128-bit input vector");
+    LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
+    // Unless the load is volatile.
+    if (!LN->isVolatile()) {
+      SDLoc dl(N);
+      unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
+      MVT MemVT = MVT::getFloatingPointVT(NumBits);
+      MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
+      SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
+      SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
+      SDValue VZLoad =
+          DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, MemVT,
+                                  LN->getPointerInfo(),
+                                  LN->getAlignment(),
+                                  LN->getMemOperand()->getFlags());
+      SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
+                                    DAG.getBitcast(InVT, VZLoad));
+      DCI.CombineTo(N, Convert);
+      DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
+      return SDValue(N, 0);
+    }
+  }
+
+  return SDValue();
+}
+
 /// Do target-specific dag combines on X86ISD::ANDNP nodes.
 static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
                             TargetLowering::DAGCombinerInfo &DCI,
@@ -43940,6 +43975,10 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::FMAXNUM:        return combineFMinNumFMaxNum(N, DAG, Subtarget);
   case X86ISD::CVTSI2P:
   case X86ISD::CVTUI2P:     return combineX86INT_TO_FP(N, DAG, DCI);
+  case X86ISD::CVTP2SI:
+  case X86ISD::CVTP2UI:
+  case X86ISD::CVTTP2SI:
+  case X86ISD::CVTTP2UI:    return combineCVTP2I_CVTTP2I(N, DAG, DCI);
   case X86ISD::BT:          return combineBT(N, DAG, DCI);
   case ISD::ANY_EXTEND:
   case ISD::ZERO_EXTEND:    return combineZext(N, DAG, DCI, Subtarget);
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 2c9b6f127a5..0b3d0f59cc6 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -7979,7 +7979,11 @@ multiclass avx512_cvtps2qq<bits<8> opc, string OpcodeStr, SDNode OpNode,
   // Explicitly specified broadcast string, since we take only 2 elements
   // from v4f32x_info source
   defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v4f32x_info, OpNode,
-                             sched.XMM, "{1to2}", "", f64mem>, EVEX_V128;
+                             sched.XMM, "{1to2}", "", f64mem, VK2WM,
+                             (v2i64 (OpNode (bc_v4f32
+                                             (v2f64
+                                              (scalar_to_vector (loadf64 addr:$src))))))>,
+                             EVEX_V128;
   defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f32x_info, OpNode,
                              sched.YMM>, EVEX_V256;
 }
@@ -7997,7 +8001,11 @@ multiclass avx512_cvttps2qq<bits<8> opc, string OpcodeStr, SDNode OpNode,
   // Explicitly specified broadcast string, since we take only 2 elements
   // from v4f32x_info source
   defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v4f32x_info, OpNode,
-                             sched.XMM, "{1to2}", "", f64mem>, EVEX_V128;
+                             sched.XMM, "{1to2}", "", f64mem, VK2WM,
+                             (v2i64 (OpNode (bc_v4f32
+                                             (v2f64
+                                              (scalar_to_vector (loadf64 addr:$src))))))>,
+                             EVEX_V128;
   defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f32x_info, OpNode,
                              sched.YMM>, EVEX_V256;
 }
@@ -8358,6 +8366,50 @@ let Predicates = [HasDQI] in {
 }
 
 let Predicates = [HasDQI, HasVLX] in {
+  def : Pat<(v2i64 (X86cvtp2Int (bc_v4f32 (v2f64 (X86vzload addr:$src))))),
+            (VCVTPS2QQZ128rm addr:$src)>;
+  def : Pat<(v2i64 (vselect VK2WM:$mask,
+                            (X86cvtp2Int (bc_v4f32 (v2f64 (X86vzload addr:$src)))),
+                            VR128X:$src0)),
+            (VCVTPS2QQZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+  def : Pat<(v2i64 (vselect VK2WM:$mask,
+                            (X86cvtp2Int (bc_v4f32 (v2f64 (X86vzload addr:$src)))),
+                            v2i64x_info.ImmAllZerosV)),
+            (VCVTPS2QQZ128rmkz VK2WM:$mask, addr:$src)>;
+
+  def : Pat<(v2i64 (X86cvtp2UInt (bc_v4f32 (v2f64 (X86vzload addr:$src))))),
+            (VCVTPS2UQQZ128rm addr:$src)>;
+  def : Pat<(v2i64 (vselect VK2WM:$mask,
+                            (X86cvtp2UInt (bc_v4f32 (v2f64 (X86vzload addr:$src)))),
+                            VR128X:$src0)),
+            (VCVTPS2UQQZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+  def : Pat<(v2i64 (vselect VK2WM:$mask,
+                            (X86cvtp2UInt (bc_v4f32 (v2f64 (X86vzload addr:$src)))),
+                            v2i64x_info.ImmAllZerosV)),
+            (VCVTPS2UQQZ128rmkz VK2WM:$mask, addr:$src)>;
+
+  def : Pat<(v2i64 (X86cvttp2si (bc_v4f32 (v2f64 (X86vzload addr:$src))))),
+            (VCVTTPS2QQZ128rm addr:$src)>;
+  def : Pat<(v2i64 (vselect VK2WM:$mask,
+                            (X86cvttp2si (bc_v4f32 (v2f64 (X86vzload addr:$src)))),
+                            VR128X:$src0)),
+            (VCVTTPS2QQZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+  def : Pat<(v2i64 (vselect VK2WM:$mask,
+                            (X86cvttp2si (bc_v4f32 (v2f64 (X86vzload addr:$src)))),
+                            v2i64x_info.ImmAllZerosV)),
+            (VCVTTPS2QQZ128rmkz VK2WM:$mask, addr:$src)>;
+
+  def : Pat<(v2i64 (X86cvttp2ui (bc_v4f32 (v2f64 (X86vzload addr:$src))))),
+            (VCVTTPS2UQQZ128rm addr:$src)>;
+  def : Pat<(v2i64 (vselect VK2WM:$mask,
+                            (X86cvttp2ui (bc_v4f32 (v2f64 (X86vzload addr:$src)))),
+                            VR128X:$src0)),
+            (VCVTTPS2UQQZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+  def : Pat<(v2i64 (vselect VK2WM:$mask,
+                            (X86cvttp2ui (bc_v4f32 (v2f64 (X86vzload addr:$src)))),
+                            v2i64x_info.ImmAllZerosV)),
+            (VCVTTPS2UQQZ128rmkz VK2WM:$mask, addr:$src)>;
+
   def : Pat<(v4i64 (fp_to_sint (v4f32 VR128X:$src))),
             (VCVTTPS2QQZ256rr VR128X:$src)>;
   def : Pat<(v4i64 (fp_to_sint (loadv4f32 addr:$src))),
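
For illustration only (not part of the patch): a minimal C sketch of the kind of source this combine and the new X86vzload patterns target, assuming AVX512DQ+VL and the existing _mm_cvttps_epi64 intrinsic; the function name is made up. The 128-bit VCVTTPS2QQ only consumes the low two floats of its source, so the full 16-byte load feeding the conversion can be shrunk to a 64-bit vzload and folded into the instruction's qword memory operand.

#include <immintrin.h>

/* Hypothetical example: only elements 0 and 1 of *p are consumed by the
   conversion, so with this change the backend can emit vcvttps2qq with a
   64-bit memory operand instead of a separate full-width vector load. */
__m128i cvt_low_two_floats(const __m128 *p) {
  return _mm_cvttps_epi64(*p);
}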