From beec41c656e7d716fd5755cce12e4934fdced267 Mon Sep 17 00:00:00 2001 From: "Luo, Yuanke" Date: Mon, 6 May 2019 08:22:37 +0000 Subject: Enable AVX512_BF16 instructions, which are supported for BFLOAT16 in Cooper Lake Summary: 1. Enable infrastructure of AVX512_BF16, which is supported for BFLOAT16 in Cooper Lake; 2. Enable VCVTNE2PS2BF16, VCVTNEPS2BF16 and DPBF16PS instructions, which are Vector Neural Network Instructions supporting BFLOAT16 inputs and conversion instructions from IEEE single precision. VCVTNE2PS2BF16: Convert Two Packed Single Data to One Packed BF16 Data. VCVTNEPS2BF16: Convert Packed Single Data to Packed BF16 Data. VDPBF16PS: Dot Product of BF16 Pairs Accumulated into Packed Single Precision. For more details about BF16 isa, please refer to the latest ISE document: https://software.intel.com/en-us/download/intel-architecture-instruction-set-extensions-programming-reference Author: LiuTianle Reviewers: craig.topper, smaslov, LuoYuanke, wxiao3, annita.zhang, RKSimon, spatel Reviewed By: craig.topper Subscribers: kristina, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D60550 llvm-svn: 360017 --- llvm/lib/Support/Host.cpp | 3 + llvm/lib/Target/X86/X86.td | 3 + llvm/lib/Target/X86/X86ISelLowering.cpp | 19 ++++ llvm/lib/Target/X86/X86ISelLowering.h | 13 +++ llvm/lib/Target/X86/X86InstrAVX512.td | 140 +++++++++++++++++++++++++++ llvm/lib/Target/X86/X86InstrFragmentsSIMD.td | 19 ++++ llvm/lib/Target/X86/X86InstrInfo.td | 1 + llvm/lib/Target/X86/X86IntrinsicsInfo.h | 11 +++ llvm/lib/Target/X86/X86Subtarget.h | 4 + 9 files changed, 213 insertions(+) (limited to 'llvm/lib') diff --git a/llvm/lib/Support/Host.cpp b/llvm/lib/Support/Host.cpp index 69362704687..4a7eff3f6e3 100644 --- a/llvm/lib/Support/Host.cpp +++ b/llvm/lib/Support/Host.cpp @@ -1375,6 +1375,9 @@ bool sys::getHostCPUFeatures(StringMap &Features) { // detecting features using the "-march=native" flag. // For more info, see X86 ISA docs. 
Features["pconfig"] = HasLeaf7 && ((EDX >> 18) & 1); + bool HasLeaf7Subleaf1 = + MaxLevel >= 7 && !getX86CpuIDAndInfoEx(0x7, 0x1, &EAX, &EBX, &ECX, &EDX); + Features["avx512bf16"] = HasLeaf7Subleaf1 && ((EAX >> 5) & 1) && HasAVX512Save; bool HasLeafD = MaxLevel >= 0xd && !getX86CpuIDAndInfoEx(0xd, 0x1, &EAX, &EBX, &ECX, &EDX); diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index fe23a2900d5..a799c1fda49 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -167,6 +167,9 @@ def FeaturePKU : SubtargetFeature<"pku", "HasPKU", "true", def FeatureVNNI : SubtargetFeature<"avx512vnni", "HasVNNI", "true", "Enable AVX-512 Vector Neural Network Instructions", [FeatureAVX512]>; +def FeatureBF16 : SubtargetFeature<"avx512bf16", "HasBF16", "true", + "Support bfloat16 floating point", + [FeatureBWI]>; def FeatureBITALG : SubtargetFeature<"avx512bitalg", "HasBITALG", "true", "Enable AVX-512 Bit Algorithms", [FeatureBWI]>; diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 053fe90d1f4..fc100fe4871 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -22624,6 +22624,21 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, PassThru, Mask); } + case CVTNEPS2BF16_MASK: { + SDValue Src = Op.getOperand(1); + SDValue PassThru = Op.getOperand(2); + SDValue Mask = Op.getOperand(3); + + if (ISD::isBuildVectorAllOnes(Mask.getNode())) + return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src); + + // Break false dependency. 
+ if (PassThru.isUndef()) + PassThru = DAG.getConstant(0, dl, PassThru.getValueType()); + + return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru, + Mask); + } default: break; } @@ -28073,6 +28088,10 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::CVTS2UI: return "X86ISD::CVTS2UI"; case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND"; case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND"; + case X86ISD::CVTNE2PS2BF16: return "X86ISD::CVTNE2PS2BF16"; + case X86ISD::CVTNEPS2BF16: return "X86ISD::CVTNEPS2BF16"; + case X86ISD::MCVTNEPS2BF16: return "X86ISD::MCVTNEPS2BF16"; + case X86ISD::DPBF16PS: return "X86ISD::DPBF16PS"; case X86ISD::LWPINS: return "X86ISD::LWPINS"; case X86ISD::MGATHER: return "X86ISD::MGATHER"; case X86ISD::MSCATTER: return "X86ISD::MSCATTER"; diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 78e414b9fe8..9b2f059ae6b 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -509,6 +509,19 @@ namespace llvm { MCVTP2SI, MCVTP2UI, MCVTTP2SI, MCVTTP2UI, MCVTSI2P, MCVTUI2P, + // Vector float to bfloat16. + // Convert TWO packed single data to one packed BF16 data + CVTNE2PS2BF16, + // Convert packed single data to packed BF16 data + CVTNEPS2BF16, + // Masked version of above. + // SRC, PASSTHRU, MASK + MCVTNEPS2BF16, + + // Dot product of BF16 pairs accumulated into + // packed single precision. + DPBF16PS, + + // Save xmm argument registers to the stack, according to %al. An operator // is needed so that this can be expanded with control flow. 
VASTART_SAVE_XMM_REGS, diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index d857b8ef1dc..d0d255b6a7f 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -12647,3 +12647,143 @@ defm VP4DPWSSDSrm : AVX512_maskable_3src_in_asm<0x53, MRMSrcMem, v16i32_info, Sched<[SchedWriteFMA.ZMM.Folded]>; } +multiclass avx512_binop_all2 opc, string OpcodeStr, + X86SchedWriteWidths sched, + AVX512VLVectorVTInfo _SrcVTInfo, + AVX512VLVectorVTInfo _DstVTInfo, + SDNode OpNode, Predicate prd, + bit IsCommutable = 0> { + let Predicates = [prd] in + defm NAME#Z : avx512_binop_rm2, + EVEX_V512, EVEX_CD8<32, CD8VF>; + let Predicates = [HasVLX, prd] in { + defm NAME#Z256 : avx512_binop_rm2, + EVEX_V256, EVEX_CD8<32, CD8VF>; + defm NAME#Z128 : avx512_binop_rm2, + EVEX_V128, EVEX_CD8<32, CD8VF>; + } +} + +defm VCVTNE2PS2BF16 : avx512_binop_all2<0x72, "vcvtne2ps2bf16", + SchedWriteCvtPD2PS, //FIXME: Should be SchedWriteCvtPS2BF + avx512vl_f32_info, avx512vl_i16_info, + X86cvtne2ps2bf16, HasBF16, 0>, T8XD; + +// Truncate Float to BFloat16 +multiclass avx512_cvtps2bf16 opc, string OpcodeStr, + X86SchedWriteWidths sched> { + let Predicates = [HasBF16] in { + defm Z : avx512_vcvt_fp, EVEX_V512; + } + let Predicates = [HasBF16, HasVLX] in { + defm Z128 : avx512_vcvt_fp, EVEX_V128; + defm Z256 : avx512_vcvt_fp, EVEX_V256; + + def : InstAlias(NAME # "Z128rr") VR128X:$dst, + VR128X:$src), 0>; + def : InstAlias(NAME # "Z128rm") VR128X:$dst, + f128mem:$src), 0, "intel">; + def : InstAlias(NAME # "Z256rr") VR128X:$dst, + VR256X:$src), 0>; + def : InstAlias(NAME # "Z256rm") VR128X:$dst, + f256mem:$src), 0, "intel">; + } +} + +defm VCVTNEPS2BF16 : avx512_cvtps2bf16<0x72, "vcvtneps2bf16", + SchedWriteCvtPD2PS>, T8XS, + EVEX_CD8<32, CD8VF>; + +let Predicates = [HasBF16, HasVLX] in { + // Special patterns to allow use of X86mcvtneps2bf16 for masking. Instruction + // patterns have been disabled with null_frag. 
+ def : Pat<(v8i16 (X86cvtneps2bf16 (v4f32 VR128X:$src))), + (VCVTNEPS2BF16Z128rr VR128X:$src)>; + def : Pat<(X86mcvtneps2bf16 (v4f32 VR128X:$src), (v8i16 VR128X:$src0), + VK4WM:$mask), + (VCVTNEPS2BF16Z128rrk VR128X:$src0, VK4WM:$mask, VR128X:$src)>; + def : Pat<(X86mcvtneps2bf16 (v4f32 VR128X:$src), v8i16x_info.ImmAllZerosV, + VK4WM:$mask), + (VCVTNEPS2BF16Z128rrkz VK4WM:$mask, VR128X:$src)>; + + def : Pat<(v8i16 (X86cvtneps2bf16 (loadv4f32 addr:$src))), + (VCVTNEPS2BF16Z128rm addr:$src)>; + def : Pat<(X86mcvtneps2bf16 (loadv4f32 addr:$src), (v8i16 VR128X:$src0), + VK4WM:$mask), + (VCVTNEPS2BF16Z128rmk VR128X:$src0, VK4WM:$mask, addr:$src)>; + def : Pat<(X86mcvtneps2bf16 (loadv4f32 addr:$src), v8i16x_info.ImmAllZerosV, + VK4WM:$mask), + (VCVTNEPS2BF16Z128rmkz VK4WM:$mask, addr:$src)>; + + def : Pat<(v8i16 (X86cvtneps2bf16 (v4f32 + (X86VBroadcast (loadf32 addr:$src))))), + (VCVTNEPS2BF16Z128rmb addr:$src)>; + def : Pat<(X86mcvtneps2bf16 (v4f32 (X86VBroadcast (loadf32 addr:$src))), + (v8i16 VR128X:$src0), VK4WM:$mask), + (VCVTNEPS2BF16Z128rmbk VR128X:$src0, VK4WM:$mask, addr:$src)>; + def : Pat<(X86mcvtneps2bf16 (v4f32 (X86VBroadcast (loadf32 addr:$src))), + v8i16x_info.ImmAllZerosV, VK4WM:$mask), + (VCVTNEPS2BF16Z128rmbkz VK4WM:$mask, addr:$src)>; +} + +let Constraints = "$src1 = $dst" in { +multiclass avx512_dpbf16ps_rm opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _, X86VectorVTInfo src_v> { + defm r: AVX512_maskable_3src, + EVEX_4V; + + defm m: AVX512_maskable_3src, EVEX_4V; + + defm mb: AVX512_maskable_3src, + EVEX_B, EVEX_4V; + +} +} // Constraints = "$src1 = $dst" + +multiclass avx512_dpbf16ps_sizes opc, string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo _, + AVX512VLVectorVTInfo src_v, Predicate prd> { + let Predicates = [prd] in { + defm Z : avx512_dpbf16ps_rm, EVEX_V512; + } + let Predicates = [HasVLX, prd] in { + defm Z256 : avx512_dpbf16ps_rm, EVEX_V256; + defm Z128 : avx512_dpbf16ps_rm, EVEX_V128; + } +} + +defm VDPBF16PS : 
avx512_dpbf16ps_sizes<0x52, "vdpbf16ps", X86dpbf16ps, + avx512vl_f32_info, avx512vl_i32_info, + HasBF16>, T8XS, EVEX_CD8<32, CD8VF>; diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td index 4d4d5faccdd..d79959e6455 100644 --- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -664,6 +664,25 @@ def X86vfproundRnd: SDNode<"X86ISD::VFPROUND_RND", SDTCisOpSmallerThanOp<0, 1>, SDTCisVT<2, i32>]>>; +// cvt fp to bfloat16 +def X86cvtne2ps2bf16 : SDNode<"X86ISD::CVTNE2PS2BF16", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisSameAs<1,2>]>>; +def X86mcvtneps2bf16 : SDNode<"X86ISD::MCVTNEPS2BF16", + SDTypeProfile<1, 3, [SDTCVecEltisVT<0, i16>, + SDTCVecEltisVT<1, f32>, + SDTCisSameAs<0, 2>, + SDTCVecEltisVT<3, i1>, + SDTCisSameNumEltsAs<1, 3>]>>; +def X86cvtneps2bf16 : SDNode<"X86ISD::CVTNEPS2BF16", + SDTypeProfile<1, 1, [SDTCVecEltisVT<0, i16>, + SDTCVecEltisVT<1, f32>]>>; +def X86dpbf16ps : SDNode<"X86ISD::DPBF16PS", + SDTypeProfile<1, 3, [SDTCVecEltisVT<0, f32>, + SDTCisSameAs<0,1>, + SDTCVecEltisVT<2, i32>, + SDTCisSameAs<2,3>]>>; + // galois field arithmetic def X86GF2P8affineinvqb : SDNode<"X86ISD::GF2P8AFFINEINVQB", SDTBlend>; def X86GF2P8affineqb : SDNode<"X86ISD::GF2P8AFFINEQB", SDTBlend>; diff --git a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td index 0176c2d707a..56bc0500078 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.td +++ b/llvm/lib/Target/X86/X86InstrInfo.td @@ -835,6 +835,7 @@ def NoVLX_Or_NoBWI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasBWI()">; def NoVLX_Or_NoDQI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasDQI()">; def PKU : Predicate<"Subtarget->hasPKU()">; def HasVNNI : Predicate<"Subtarget->hasVNNI()">; +def HasBF16 : Predicate<"Subtarget->hasBF16()">; def HasBITALG : Predicate<"Subtarget->hasBITALG()">; def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">; diff --git 
a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h index 54168762c7a..40141d89462 100644 --- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h +++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h @@ -19,6 +19,7 @@ namespace llvm { enum IntrinsicType : uint16_t { + CVTNEPS2BF16_MASK, GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, XGETBV, ADX, FPCLASSS, INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, INTR_TYPE_4OP, INTR_TYPE_3OP_IMM8, @@ -981,6 +982,16 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_vpshufbitqmb_128, INTR_TYPE_2OP, X86ISD::VPSHUFBITQMB, 0), X86_INTRINSIC_DATA(avx512_vpshufbitqmb_256, INTR_TYPE_2OP, X86ISD::VPSHUFBITQMB, 0), X86_INTRINSIC_DATA(avx512_vpshufbitqmb_512, INTR_TYPE_2OP, X86ISD::VPSHUFBITQMB, 0), + // bfloat16 + X86_INTRINSIC_DATA(avx512bf16_cvtne2ps2bf16_128, INTR_TYPE_2OP, X86ISD::CVTNE2PS2BF16, 0), + X86_INTRINSIC_DATA(avx512bf16_cvtne2ps2bf16_256, INTR_TYPE_2OP, X86ISD::CVTNE2PS2BF16, 0), + X86_INTRINSIC_DATA(avx512bf16_cvtne2ps2bf16_512, INTR_TYPE_2OP, X86ISD::CVTNE2PS2BF16, 0), + X86_INTRINSIC_DATA(avx512bf16_cvtneps2bf16_256, INTR_TYPE_1OP, X86ISD::CVTNEPS2BF16, 0), + X86_INTRINSIC_DATA(avx512bf16_cvtneps2bf16_512, INTR_TYPE_1OP, X86ISD::CVTNEPS2BF16, 0), + X86_INTRINSIC_DATA(avx512bf16_dpbf16ps_128, INTR_TYPE_3OP, X86ISD::DPBF16PS, 0), + X86_INTRINSIC_DATA(avx512bf16_dpbf16ps_256, INTR_TYPE_3OP, X86ISD::DPBF16PS, 0), + X86_INTRINSIC_DATA(avx512bf16_dpbf16ps_512, INTR_TYPE_3OP, X86ISD::DPBF16PS, 0), + X86_INTRINSIC_DATA(avx512bf16_mask_cvtneps2bf16_128, CVTNEPS2BF16_MASK, X86ISD::CVTNEPS2BF16, X86ISD::MCVTNEPS2BF16), X86_INTRINSIC_DATA(bmi_bextr_32, INTR_TYPE_2OP, X86ISD::BEXTR, 0), X86_INTRINSIC_DATA(bmi_bextr_64, INTR_TYPE_2OP, X86ISD::BEXTR, 0), X86_INTRINSIC_DATA(bmi_bzhi_32, INTR_TYPE_2OP, X86ISD::BZHI, 0), diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h index 0ff9d544d82..3b11bb12f62 100644 --- 
a/llvm/lib/Target/X86/X86Subtarget.h +++ b/llvm/lib/Target/X86/X86Subtarget.h @@ -353,6 +353,9 @@ protected: /// Processor has AVX-512 Vector Neural Network Instructions bool HasVNNI = false; + /// Processor has AVX-512 bfloat16 floating-point extensions + bool HasBF16 = false; + /// Processor has AVX-512 Bit Algorithms instructions bool HasBITALG = false; @@ -668,6 +671,7 @@ public: bool hasVLX() const { return HasVLX; } bool hasPKU() const { return HasPKU; } bool hasVNNI() const { return HasVNNI; } + bool hasBF16() const { return HasBF16; } bool hasBITALG() const { return HasBITALG; } bool hasMPX() const { return HasMPX; } bool hasSHSTK() const { return HasSHSTK; } -- cgit v1.2.3