diff options
author | Matt Arsenault <Matthew.Arsenault@amd.com> | 2018-04-30 19:08:16 +0000 |
---|---|---|
committer | Matt Arsenault <Matthew.Arsenault@amd.com> | 2018-04-30 19:08:16 +0000 |
commit | 0084adc5165622ea838f9af1e5a0559cd128b483 (patch) | |
tree | a24f460232a0d35c1eca0b9045b4a22645aa2866 | |
parent | 45c7205b617895a96073da30aabfdbae2bb8651c (diff) | |
download | bcm5719-llvm-0084adc5165622ea838f9af1e5a0559cd128b483.tar.gz bcm5719-llvm-0084adc5165622ea838f9af1e5a0559cd128b483.zip |
AMDGPU: Add Vega12 and Vega20
Changes by
Matt Arsenault
Konstantin Zhuravlyov
llvm-svn: 331215
43 files changed, 2133 insertions, 322 deletions
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 886c378b21b..7d358dfdce4 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -200,6 +200,16 @@ names from both the *Processor* and *Alternative Processor* can be used. - Radeon Instinct MI25 ``gfx902`` ``amdgcn`` APU - xnack - Ryzen 3 2200G [on] - Ryzen 5 2400G + ``gfx904`` ``amdgcn`` dGPU - xnack *TBA* + [off] + .. TODO + Add product + names. + ``gfx906`` ``amdgcn`` dGPU - xnack *TBA* + [off] + .. TODO + Add product + names. =========== =============== ============ ===== ========= ======= ================== .. _amdgpu-target-features: @@ -547,8 +557,8 @@ The AMDGPU backend uses the following ELF header: ``EF_AMDGPU_MACH_AMDGCN_GFX810`` 0x02b ``gfx810`` ``EF_AMDGPU_MACH_AMDGCN_GFX900`` 0x02c ``gfx900`` ``EF_AMDGPU_MACH_AMDGCN_GFX902`` 0x02d ``gfx902`` - *reserved* 0x02e Reserved. - *reserved* 0x02f Reserved. + ``EF_AMDGPU_MACH_AMDGCN_GFX904`` 0x02e ``gfx904`` + ``EF_AMDGPU_MACH_AMDGCN_GFX906`` 0x02f ``gfx906`` *reserved* 0x030 Reserved. ================================= ========== ============================= @@ -765,7 +775,7 @@ The following relocation types are supported: ``R_AMDGPU_ABS32_HI`` Static, 2 ``word32`` (S + A) >> 32 Dynamic ``R_AMDGPU_ABS64`` Static, 3 ``word64`` S + A - Dynamic + Dynamic ``R_AMDGPU_REL32`` Static 4 ``word32`` S + A - P ``R_AMDGPU_REL64`` Static 5 ``word64`` S + A - P ``R_AMDGPU_ABS32`` Static, 6 ``word32`` S + A @@ -784,7 +794,7 @@ the ``mesa3d`` OS, which does not support ``R_AMDGPU_ABS64``. There is no current OS loader support for 32 bit programs and so ``R_AMDGPU_ABS32`` is not used. - + .. _amdgpu-dwarf: DWARF diff --git a/llvm/include/llvm/BinaryFormat/ELF.h b/llvm/include/llvm/BinaryFormat/ELF.h index 4651e518c4b..7e81c7130f7 100644 --- a/llvm/include/llvm/BinaryFormat/ELF.h +++ b/llvm/include/llvm/BinaryFormat/ELF.h @@ -687,7 +687,7 @@ enum : unsigned { // AMDGCN-based processors. EF_AMDGPU_MACH_AMDGCN_FIRST = 0x020, - EF_AMDGPU_MACH_AMDGCN_LAST = 0x02d, + EF_AMDGPU_MACH_AMDGCN_LAST = 0x02f, // AMDGCN GFX6. EF_AMDGPU_MACH_AMDGCN_GFX600 = 0x020, EF_AMDGPU_MACH_AMDGCN_GFX601 = 0x021, @@ -705,12 +705,12 @@ enum : unsigned { // AMDGCN GFX9. EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x02c, EF_AMDGPU_MACH_AMDGCN_GFX902 = 0x02d, + EF_AMDGPU_MACH_AMDGCN_GFX904 = 0x02e, + EF_AMDGPU_MACH_AMDGCN_GFX906 = 0x02f, // Reserved for AMDGCN-based processors. EF_AMDGPU_MACH_AMDGCN_RESERVED0 = 0x027, - EF_AMDGPU_MACH_AMDGCN_RESERVED1 = 0x02e, - EF_AMDGPU_MACH_AMDGCN_RESERVED2 = 0x02f, - EF_AMDGPU_MACH_AMDGCN_RESERVED3 = 0x030, + EF_AMDGPU_MACH_AMDGCN_RESERVED1 = 0x030, // Indicates if the xnack target feature is enabled for all code contained in // the object. diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 731ad66829f..f51df9f16bf 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -1287,6 +1287,109 @@ def int_amdgcn_ds_bpermute : GCCBuiltin<"__builtin_amdgcn_ds_bpermute">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrConvergent]>; +//===----------------------------------------------------------------------===// +// Deep learning intrinsics. +//===----------------------------------------------------------------------===// + +// f32 %r = llvm.amdgcn.fdot2(v2f16 %a, v2f16 %b, f32 %c) +// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c +def int_amdgcn_fdot2 : + GCCBuiltin<"__builtin_amdgcn_fdot2">, + Intrinsic< + [llvm_float_ty], // %r + [ + llvm_v2f16_ty, // %a + llvm_v2f16_ty, // %b + llvm_float_ty // %c + ], + [IntrNoMem, IntrSpeculatable] + >; + +// i32 %r = llvm.amdgcn.sdot2(v2i16 %a, v2i16 %b, i32 %c) +// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c +def int_amdgcn_sdot2 : + GCCBuiltin<"__builtin_amdgcn_sdot2">, + Intrinsic< + [llvm_i32_ty], // %r + [ + llvm_v2i16_ty, // %a + llvm_v2i16_ty, // %b + llvm_i32_ty // %c + ], + [IntrNoMem, IntrSpeculatable] + >; + +// u32 %r = llvm.amdgcn.udot2(v2u16 %a, v2u16 %b, u32 %c) +// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c +def int_amdgcn_udot2 : + GCCBuiltin<"__builtin_amdgcn_udot2">, + Intrinsic< + [llvm_i32_ty], // %r + [ + llvm_v2i16_ty, // %a + llvm_v2i16_ty, // %b + llvm_i32_ty // %c + ], + [IntrNoMem, IntrSpeculatable] + >; + +// i32 %r = llvm.amdgcn.sdot4(v4i8 (as i32) %a, v4i8 (as i32) %b, i32 %c) +// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c +def int_amdgcn_sdot4 : + GCCBuiltin<"__builtin_amdgcn_sdot4">, + Intrinsic< + [llvm_i32_ty], // %r + [ + llvm_i32_ty, // %a + llvm_i32_ty, // %b + llvm_i32_ty // %c + ], + [IntrNoMem, IntrSpeculatable] + >; + +// u32 %r = llvm.amdgcn.udot4(v4u8 (as u32) %a, v4u8 (as u32) %b, u32 %c) +// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c +def int_amdgcn_udot4 : + GCCBuiltin<"__builtin_amdgcn_udot4">, + Intrinsic< + [llvm_i32_ty], // %r + [ + llvm_i32_ty, // %a + llvm_i32_ty, // %b + llvm_i32_ty // %c + ], + [IntrNoMem, IntrSpeculatable] + >; + +// i32 %r = llvm.amdgcn.sdot8(v8i4 (as i32) %a, v8i4 (as i32) %b, i32 %c) +// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + +// %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c +def int_amdgcn_sdot8 : + GCCBuiltin<"__builtin_amdgcn_sdot8">, + Intrinsic< + [llvm_i32_ty], // %r + [ + llvm_i32_ty, // %a + llvm_i32_ty, // %b + llvm_i32_ty // %c + ], + [IntrNoMem, IntrSpeculatable] + >; + +// u32 %r = llvm.amdgcn.udot8(v8u4 (as u32) %a, v8u4 (as u32) %b, u32 %c) +// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + +// %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c +def int_amdgcn_udot8 : + GCCBuiltin<"__builtin_amdgcn_udot8">, + Intrinsic< + [llvm_i32_ty], // %r + [ + llvm_i32_ty, // %a + llvm_i32_ty, // %b + llvm_i32_ty // %c + ], + [IntrNoMem, IntrSpeculatable] + >; //===----------------------------------------------------------------------===// // Special Intrinsics for backend internal use only. No frontend diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp index 642a89fe930..53664906847 100644 --- a/llvm/lib/ObjectYAML/ELFYAML.cpp +++ b/llvm/lib/ObjectYAML/ELFYAML.cpp @@ -400,6 +400,8 @@ void ScalarBitSetTraits<ELFYAML::ELF_EF>::bitset(IO &IO, BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX810, EF_AMDGPU_MACH); BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX900, EF_AMDGPU_MACH); BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX902, EF_AMDGPU_MACH); + BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX904, EF_AMDGPU_MACH); + BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX906, EF_AMDGPU_MACH); BCase(EF_AMDGPU_XNACK); break; case ELF::EM_X86_64: diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 37c593d951e..73490f6c421 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -127,6 +127,12 @@ def FeatureMadMixInsts : SubtargetFeature<"mad-mix-insts", "Has v_mad_mix_f32, v_mad_mixlo_f16, v_mad_mixhi_f16 instructions" >; +def FeatureFmaMixInsts : SubtargetFeature<"fma-mix-insts", + "HasFmaMixInsts", + "true", + "Has v_fma_mix_f32, v_fma_mixlo_f16, v_fma_mixhi_f16 instructions" +>; + // XNACK is disabled if SH_MEM_CONFIG.ADDRESS_MODE = GPUVM on chips that support // XNACK. The current default kernel driver setting is: // - graphics ring: XNACK disabled @@ -310,6 +316,12 @@ def FeatureUnpackedD16VMem : SubtargetFeature<"unpacked-d16-vmem", "Has unpacked d16 vmem instructions" >; +def FeatureDLInsts : SubtargetFeature<"dl-insts", + "HasDLInsts", + "true", + "Has deep learning instructions" +>; + //===------------------------------------------------------------===// // Subtarget Features (options and debugging) //===------------------------------------------------------------===// @@ -606,6 +618,18 @@ def FeatureISAVersion9_0_2 : SubtargetFeatureISAVersion <9,0,2, FeatureXNACK ]>; +def FeatureISAVersion9_0_4 : SubtargetFeatureISAVersion <9,0,4, + [FeatureGFX9, + FeatureLDSBankCount32, + FeatureFmaMixInsts]>; + +def FeatureISAVersion9_0_6 : SubtargetFeatureISAVersion <9,0,6, + [FeatureGFX9, + HalfRate64Ops, + FeatureFmaMixInsts, + FeatureLDSBankCount32, + FeatureDLInsts]>; + //===----------------------------------------------------------------------===// // Debugger related subtarget features. //===----------------------------------------------------------------------===// @@ -788,6 +812,13 @@ def HasVGPRIndexMode : Predicate<"Subtarget->hasVGPRIndexMode()">, def HasMovrel : Predicate<"Subtarget->hasMovrel()">, AssemblerPredicate<"FeatureMovrel">; +def HasFmaMixInsts : Predicate<"Subtarget->hasFmaMixInsts()">, + AssemblerPredicate<"FeatureFmaMixInsts">; + +def HasDLInsts : Predicate<"Subtarget->hasDLInsts()">, + AssemblerPredicate<"FeatureDLInsts">; + + def EnableLateCFGStructurize : Predicate< "EnableLateStructurizeCFG">; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 1b98edcf95b..47321a76e5c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -215,7 +215,7 @@ private: void SelectS_BFE(SDNode *N); bool isCBranchSCC(const SDNode *N) const; void SelectBRCOND(SDNode *N); - void SelectFMAD(SDNode *N); + void SelectFMAD_FMA(SDNode *N); void SelectATOMIC_CMP_SWAP(SDNode *N); protected: @@ -621,7 +621,8 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { SelectBRCOND(N); return; case ISD::FMAD: - SelectFMAD(N); + case ISD::FMA: + SelectFMAD_FMA(N); return; case AMDGPUISD::ATOMIC_CMP_SWAP: SelectATOMIC_CMP_SWAP(N); @@ -1728,9 +1729,13 @@ void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) { VCC.getValue(0)); } -void AMDGPUDAGToDAGISel::SelectFMAD(SDNode *N) { +void AMDGPUDAGToDAGISel::SelectFMAD_FMA(SDNode *N) { MVT VT = N->getSimpleValueType(0); - if (VT != MVT::f32 || !Subtarget->hasMadMixInsts()) { + bool IsFMA = N->getOpcode() == ISD::FMA; + if (VT != MVT::f32 || (!Subtarget->hasMadMixInsts() && + !Subtarget->hasFmaMixInsts()) || + ((IsFMA && Subtarget->hasMadMixInsts()) || + (!IsFMA && Subtarget->hasFmaMixInsts()))) { SelectCode(N); return; } @@ -1740,13 +1745,13 @@ void AMDGPUDAGToDAGISel::SelectFMAD(SDNode *N) { SDValue Src2 = N->getOperand(2); unsigned Src0Mods, Src1Mods, Src2Mods; - // Avoid using v_mad_mix_f32 unless there is actually an operand using the - // conversion from f16. + // Avoid using v_mad_mix_f32/v_fma_mix_f32 unless there is actually an operand + // using the conversion from f16. bool Sel0 = SelectVOP3PMadMixModsImpl(Src0, Src0, Src0Mods); bool Sel1 = SelectVOP3PMadMixModsImpl(Src1, Src1, Src1Mods); bool Sel2 = SelectVOP3PMadMixModsImpl(Src2, Src2, Src2Mods); - assert(!Subtarget->hasFP32Denormals() && + assert((IsFMA || !Subtarget->hasFP32Denormals()) && "fmad selected with denormals enabled"); // TODO: We can select this with f32 denormals enabled if all the sources are // converted from f16 (in which case fmad isn't legal). @@ -1762,7 +1767,9 @@ void AMDGPUDAGToDAGISel::SelectFMAD(SDNode *N) { Zero, Zero }; - CurDAG->SelectNodeTo(N, AMDGPU::V_MAD_MIX_F32, MVT::f32, Ops); + CurDAG->SelectNodeTo(N, + IsFMA ? AMDGPU::V_FMA_MIX_F32 : AMDGPU::V_MAD_MIX_F32, + MVT::f32, Ops); } else { SelectCode(N); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 0ede62ba4d4..c60e25390c1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -939,7 +939,8 @@ bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const { // where this is OK to use. bool AMDGPUTargetLowering::isFPExtFoldable(unsigned Opcode, EVT DestVT, EVT SrcVT) const { - return Opcode == ISD::FMAD && Subtarget->hasMadMixInsts() && + return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) || + (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) && DestVT.getScalarType() == MVT::f32 && !Subtarget->hasFP32Denormals() && SrcVT.getScalarType() == MVT::f16; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 07ed04e41d7..b3b485e548b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -148,6 +148,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, HasIntClamp(false), HasVOP3PInsts(false), HasMadMixInsts(false), + HasFmaMixInsts(false), HasMovrel(false), HasVGPRIndexMode(false), HasScalarStores(false), @@ -160,6 +161,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, HasSDWAMac(false), HasSDWAOutModsVOPC(false), HasDPP(false), + HasDLInsts(false), FlatAddressSpace(false), FlatInstOffsets(false), FlatGlobalInsts(false), diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index 3bcb701af15..996ae9c2f0b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -72,7 +72,10 @@ public: ISAVersion8_0_3, ISAVersion8_1_0, ISAVersion9_0_0, - ISAVersion9_0_2 + ISAVersion9_0_1, + ISAVersion9_0_2, + ISAVersion9_0_4, + ISAVersion9_0_6 }; enum TrapHandlerAbi { @@ -150,6 +153,7 @@ protected: bool HasIntClamp; bool HasVOP3PInsts; bool HasMadMixInsts; + bool HasFmaMixInsts; bool HasMovrel; bool HasVGPRIndexMode; bool HasScalarStores; @@ -162,6 +166,7 @@ protected: bool HasSDWAMac; bool HasSDWAOutModsVOPC; bool HasDPP; + bool HasDLInsts; bool FlatAddressSpace; bool FlatInstOffsets; bool FlatGlobalInsts; @@ -329,6 +334,10 @@ public: return HasMadMixInsts; } + bool hasFmaMixInsts() const { + return HasFmaMixInsts; + } + bool hasCARRY() const { return (getGeneration() >= EVERGREEN); } @@ -534,6 +543,10 @@ public: return getGeneration() < SEA_ISLANDS; } + bool hasDLInsts() const { + return HasDLInsts; + } + /// \brief Returns the offset in bytes from the start of the input buffer /// of the first explicit kernel argument. unsigned getExplicitKernelArgOffset(const MachineFunction &MF) const { diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index d22a0c90c0d..a249c99f7a7 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -4601,12 +4601,14 @@ void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands, addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI); } - // special case v_mac_{f16, f32}: + // Special case v_mac_{f16, f32} and v_fmac_f32 (gfx906): // it has src2 register operand that is tied to dst operand // we don't allow modifiers for this operand in assembler so src2_modifiers - // should be 0 - if (Opc == AMDGPU::V_MAC_F32_e64_si || Opc == AMDGPU::V_MAC_F32_e64_vi || - Opc == AMDGPU::V_MAC_F16_e64_vi) { + // should be 0. + if (Opc == AMDGPU::V_MAC_F32_e64_si || + Opc == AMDGPU::V_MAC_F32_e64_vi || + Opc == AMDGPU::V_MAC_F16_e64_vi || + Opc == AMDGPU::V_FMAC_F32_e64_vi) { auto it = Inst.begin(); std::advance(it, AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2_modifiers)); it = Inst.insert(it, MCOperand::createImm(0)); // no modifiers for src2 diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index b51c23fd688..7277a81f2e5 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -201,7 +201,17 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, if (STI.getFeatureBits()[AMDGPU::FeatureUnpackedD16VMem]) { Res = tryDecodeInst(DecoderTableGFX80_UNPACKED64, MI, QW, Address); - if (Res) break; + if (Res) + break; + } + + // Some GFX9 subtargets repurposed the v_mad_mix_f32, v_mad_mixlo_f16 and + // v_mad_mixhi_f16 for FMA variants. Try to decode using this special + // table first so we print the correct name. + if (STI.getFeatureBits()[AMDGPU::FeatureFmaMixInsts]) { + Res = tryDecodeInst(DecoderTableGFX9_DL64, MI, QW, Address); + if (Res) + break; } } diff --git a/llvm/lib/Target/AMDGPU/GCNProcessors.td b/llvm/lib/Target/AMDGPU/GCNProcessors.td index c4f585d5140..d76acfa24f9 100644 --- a/llvm/lib/Target/AMDGPU/GCNProcessors.td +++ b/llvm/lib/Target/AMDGPU/GCNProcessors.td @@ -148,3 +148,11 @@ def : ProcessorModel<"gfx900", SIQuarterSpeedModel, def : ProcessorModel<"gfx902", SIQuarterSpeedModel, [FeatureISAVersion9_0_2] >; + +def : ProcessorModel<"gfx904", SIQuarterSpeedModel, + [FeatureISAVersion9_0_4] +>; + +def : ProcessorModel<"gfx906", SIQuarterSpeedModel, + [FeatureISAVersion9_0_6] +>; diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index 728e5ab8bb2..caed0a16d42 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -95,6 +95,8 @@ unsigned AMDGPUTargetStreamer::getMACH(StringRef GPU) const { // AMDGCN GFX9. .Case("gfx900", ELF::EF_AMDGPU_MACH_AMDGCN_GFX900) .Case("gfx902", ELF::EF_AMDGPU_MACH_AMDGCN_GFX902) + .Case("gfx904", ELF::EF_AMDGPU_MACH_AMDGCN_GFX904) + .Case("gfx906", ELF::EF_AMDGPU_MACH_AMDGCN_GFX906) // Not specified processor. .Default(ELF::EF_AMDGPU_MACH_NONE); } diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 481eb741af3..e4f121368a4 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -127,14 +127,18 @@ static bool isInlineConstantIfFolded(const SIInstrInfo *TII, unsigned Opc = UseMI.getOpcode(); switch (Opc) { case AMDGPU::V_MAC_F32_e64: - case AMDGPU::V_MAC_F16_e64: { + case AMDGPU::V_MAC_F16_e64: + case AMDGPU::V_FMAC_F32_e64: { // Special case for mac. Since this is replaced with mad when folded into // src2, we need to check the legality for the final instruction. int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); if (static_cast<int>(OpNo) == Src2Idx) { + bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64; bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64; - const MCInstrDesc &MadDesc - = TII->get(IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16); + + unsigned Opc = IsFMA ? + AMDGPU::V_FMA_F32 : (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16); + const MCInstrDesc &MadDesc = TII->get(Opc); return TII->isInlineConstant(OpToFold, MadDesc.OpInfo[OpNo].OperandType); } return false; @@ -224,13 +228,17 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList, // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2 unsigned Opc = MI->getOpcode(); - if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64) && + if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 || + Opc == AMDGPU::V_FMAC_F32_e64) && (int)OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)) { + bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64; bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64; + unsigned NewOpc = IsFMA ? + AMDGPU::V_FMA_F32 : (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16); // Check if changing this to a v_mad_{f16, f32} instruction will allow us // to fold the operand. - MI->setDesc(TII->get(IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16)); + MI->setDesc(TII->get(NewOpc)); bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold, TII); if (FoldAsMAD) { MI->untieRegOperand(OpNo); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 925fdce757a..ebc686ca83d 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -3272,12 +3272,17 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { VT = VT.getScalarType(); switch (VT.getSimpleVT().SimpleTy) { - case MVT::f32: + case MVT::f32: { // This is as fast on some subtargets. However, we always have full rate f32 // mad available which returns the same result as the separate operations // which we should prefer over fma. We can't use this if we want to support // denormals, so only report this in these cases. - return Subtarget->hasFP32Denormals() && Subtarget->hasFastFMAF32(); + if (Subtarget->hasFP32Denormals()) + return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts(); + + // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32. + return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts(); + } case MVT::f64: return true; case MVT::f16: diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index f4ff718e42d..897ffa948e2 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2161,20 +2161,24 @@ static int64_t getFoldableImm(const MachineOperand* MO) { MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, MachineInstr &MI, LiveVariables *LV) const { + unsigned Opc = MI.getOpcode(); bool IsF16 = false; + bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64; - switch (MI.getOpcode()) { + switch (Opc) { default: return nullptr; case AMDGPU::V_MAC_F16_e64: IsF16 = true; LLVM_FALLTHROUGH; case AMDGPU::V_MAC_F32_e64: + case AMDGPU::V_FMAC_F32_e64: break; case AMDGPU::V_MAC_F16_e32: IsF16 = true; LLVM_FALLTHROUGH; - case AMDGPU::V_MAC_F32_e32: { + case AMDGPU::V_MAC_F32_e32: + case AMDGPU::V_FMAC_F32_e32: { int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0); const MachineOperand *Src0 = &MI.getOperand(Src0Idx); @@ -2199,7 +2203,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp); const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod); - if (!Src0Mods && !Src1Mods && !Clamp && !Omod && + if (!IsFMA && !Src0Mods && !Src1Mods && !Clamp && !Omod && // If we have an SGPR input, we will violate the constant bus restriction. (!Src0->isReg() || !RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) { if (auto Imm = getFoldableImm(Src2)) { @@ -2230,8 +2234,10 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, } } - return BuildMI(*MBB, MI, MI.getDebugLoc(), - get(IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32)) + assert((!IsFMA || !IsF16) && "fmac only expected with f32"); + unsigned NewOpc = IsFMA ? AMDGPU::V_FMA_F32 : + (IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32); + return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc)) .add(*Dst) .addImm(Src0Mods ? Src0Mods->getImm() : 0) .add(*Src0) @@ -4048,17 +4054,23 @@ void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist, legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL); legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL); - unsigned Xor = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - BuildMI(MBB, MII, DL, get(AMDGPU::V_XOR_B32_e64), Xor) - .add(Src0) - .add(Src1); + unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + if (ST.hasDLInsts()) { + BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest) + .add(Src0) + .add(Src1); + } else { + unsigned Xor = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + BuildMI(MBB, MII, DL, get(AMDGPU::V_XOR_B32_e64), Xor) + .add(Src0) + .add(Src1); - unsigned Not = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - BuildMI(MBB, MII, DL, get(AMDGPU::V_NOT_B32_e64), Not) - .addReg(Xor); + BuildMI(MBB, MII, DL, get(AMDGPU::V_NOT_B32_e64), NewDest) + .addReg(Xor); + } - MRI.replaceRegWith(Dest.getReg(), Not); - addUsersToMoveToVALUWorklist(Not, MRI, Worklist); + MRI.replaceRegWith(Dest.getReg(), NewDest); + addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist); } void SIInstrInfo::splitScalar64BitUnaryOp( diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 8797253f183..ef7fbfba416 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1934,6 +1934,9 @@ def VOP_I32_F32_I32_I32 : VOPProfile <[i32, f32, i32, i32]>; def VOP_I64_I64_I32_I64 : VOPProfile <[i64, i64, i32, i64]>; def VOP_V4I32_I64_I32_V4I32 : VOPProfile <[v4i32, i64, i32, v4i32]>; +def VOP_F32_V2F16_V2F16_F32 : VOPProfile <[f32, v2f16, v2f16, f32]>; +def VOP_I32_V2I16_V2I16_I32 : VOPProfile <[i32, v2i16, v2i16, i32]>; + class Commutable_REV <string revOp, bit isOrig> { string RevOp = revOp; bit IsOrig = isOrig; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 415806bcb20..8080151d6d9 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1367,6 +1367,16 @@ def : GCNPat< >; } +let OtherPredicates = [HasDLInsts] in { +def : GCNPat < + (fma (f32 (VOP3Mods0 f32:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), + (f32 (VOP3Mods f32:$src1, i32:$src1_modifiers)), + (f32 (VOP3NoMods f32:$src2))), + (V_FMAC_F32_e64 $src0_modifiers, $src0, $src1_modifiers, $src1, + SRCMODS.NONE, $src2, $clamp, $omod) +>; +} // End OtherPredicates = [HasDLInsts] + // Allow integer inputs class ExpPattern<SDPatternOperator node, ValueType vt, Instruction Inst> : GCNPat< diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp index a346e409cb3..61cbba4c8ae 100644 --- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -100,6 +100,7 @@ static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII, case AMDGPU::V_MAC_F32_e64: case AMDGPU::V_MAC_F16_e64: + case AMDGPU::V_FMAC_F32_e64: if (!isVGPR(Src2, TRI, MRI) || TII->hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers)) return false; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 706f0171dbd..8f687fdc60a 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -221,6 +221,10 @@ IsaVersion getIsaVersion(const FeatureBitset &Features) { return {9, 0, 0}; if (Features.test(FeatureISAVersion9_0_2)) return {9, 0, 2}; + if (Features.test(FeatureISAVersion9_0_4)) + return {9, 0, 4}; + if (Features.test(FeatureISAVersion9_0_6)) + return {9, 0, 6}; if (Features.test(FeatureGFX9)) return {9, 0, 0}; diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index b012dd3506e..5ec1a15c5cd 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -491,6 +491,19 @@ defm V_MAC_F16 : VOP2Inst <"v_mac_f16", VOP_MAC_F16>; } // End SubtargetPredicate = Has16BitInsts +let SubtargetPredicate = HasDLInsts in { + +defm V_XNOR_B32 : VOP2Inst <"v_xnor_b32", VOP_I32_I32_I32>; + +let Constraints = "$vdst = $src2", + DisableEncoding="$src2", + isConvertibleToThreeAddress = 1, + isCommutable = 1 in { +defm V_FMAC_F32 : VOP2Inst <"v_fmac_f32", VOP_MAC_F32>; +} + +} // End SubtargetPredicate = HasDLInsts + // Note: 16-bit instructions produce a 0 result in the high 16-bits. multiclass Arithmetic_i16_Pats <SDPatternOperator op, Instruction inst> { @@ -944,3 +957,10 @@ def : SI2_VI3Alias <"v_cvt_pknorm_u16_f32", V_CVT_PKNORM_U16_F32_e64_vi>; def : SI2_VI3Alias <"v_cvt_pkrtz_f16_f32", V_CVT_PKRTZ_F16_F32_e64_vi>; } // End SubtargetPredicate = isVI + +let SubtargetPredicate = HasDLInsts in { + +defm V_FMAC_F32 : VOP2_Real_e32e64_vi <0x3b>; +defm V_XNOR_B32 : VOP2_Real_e32e64_vi <0x3d>; + +} // End SubtargetPredicate = HasDLInsts diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index d2530c45459..3127532a8e0 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -68,6 +68,67 @@ def V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3_Profile<VOP_V2I16_V2I1 def V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, ashr_rev>; def V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshr_rev>; +multiclass MadFmaMixPats<SDPatternOperator fma_like, + Instruction mix_inst, + Instruction mixlo_inst, + Instruction mixhi_inst> { + def : GCNPat < + (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)), + (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)), + (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))), + (mixlo_inst $src0_modifiers, $src0, + $src1_modifiers, $src1, + $src2_modifiers, $src2, + DSTCLAMP.NONE, + (i32 (IMPLICIT_DEF))) + >; + + // FIXME: Special case handling for maxhi (especially for clamp) + // because dealing with the write to high half of the register is + // difficult. + def : GCNPat < + (build_vector f16:$elt0, (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)), + (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)), + (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))), + (v2f16 (mixhi_inst $src0_modifiers, $src0, + $src1_modifiers, $src1, + $src2_modifiers, $src2, + DSTCLAMP.NONE, + $elt0)) + >; + + def : GCNPat < + (build_vector + f16:$elt0, + (AMDGPUclamp (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)), + (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)), + (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))), + (v2f16 (mixhi_inst $src0_modifiers, $src0, + $src1_modifiers, $src1, + $src2_modifiers, $src2, + DSTCLAMP.ENABLE, + $elt0)) + >; + + def : GCNPat < + (AMDGPUclamp (build_vector + (fpround (fma_like (f32 (VOP3PMadMixMods f16:$lo_src0, i32:$lo_src0_modifiers)), + (f32 (VOP3PMadMixMods f16:$lo_src1, i32:$lo_src1_modifiers)), + (f32 (VOP3PMadMixMods f16:$lo_src2, i32:$lo_src2_modifiers)))), + (fpround (fma_like (f32 (VOP3PMadMixMods f16:$hi_src0, i32:$hi_src0_modifiers)), + (f32 (VOP3PMadMixMods f16:$hi_src1, i32:$hi_src1_modifiers)), + (f32 (VOP3PMadMixMods f16:$hi_src2, i32:$hi_src2_modifiers)))))), + (v2f16 (mixhi_inst $hi_src0_modifiers, $hi_src0, + $hi_src1_modifiers, $hi_src1, + $hi_src2_modifiers, $hi_src2, + DSTCLAMP.ENABLE, + (mixlo_inst $lo_src0_modifiers, $lo_src0, + $lo_src1_modifiers, $lo_src1, + $lo_src2_modifiers, $lo_src2, + DSTCLAMP.ENABLE, + (i32 (IMPLICIT_DEF))))) + >; +} let SubtargetPredicate = HasMadMixInsts in { // These are VOP3a-like opcodes which accept no omod. @@ -84,64 +145,37 @@ def V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3_Profile<VOP_F16_F16 } } -def : GCNPat < - (f16 (fpround (fmad (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)), - (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)), - (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))), - (V_MAD_MIXLO_F16 $src0_modifiers, $src0, - $src1_modifiers, $src1, - $src2_modifiers, $src2, - DSTCLAMP.NONE, - (i32 (IMPLICIT_DEF))) ->; +defm : MadFmaMixPats<fmad, V_MAD_MIX_F32, V_MAD_MIXLO_F16, V_MAD_MIXHI_F16>; +} // End SubtargetPredicate = HasMadMixInsts -// FIXME: Special case handling for maxhi (especially for clamp) -// because dealing with the write to high half of the register is -// difficult. -def : GCNPat < - (build_vector f16:$elt0, (fpround (fmad (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)), - (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)), - (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))), - (v2f16 (V_MAD_MIXHI_F16 $src0_modifiers, $src0, - $src1_modifiers, $src1, - $src2_modifiers, $src2, - DSTCLAMP.NONE, - $elt0)) ->; -def : GCNPat < - (build_vector - f16:$elt0, - (AMDGPUclamp (fpround (fmad (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)), - (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)), - (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))), - (v2f16 (V_MAD_MIXHI_F16 $src0_modifiers, $src0, - $src1_modifiers, $src1, - $src2_modifiers, $src2, - DSTCLAMP.ENABLE, - $elt0)) ->; +// Essentially the same as the mad_mix versions +let SubtargetPredicate = HasFmaMixInsts in { +let isCommutable = 1 in { +def V_FMA_MIX_F32 : VOP3_VOP3PInst<"v_fma_mix_f32", VOP3_Profile<VOP_F32_F16_F16_F16, VOP3_OPSEL>>; -def : GCNPat < - (AMDGPUclamp (build_vector - (fpround (fmad (f32 (VOP3PMadMixMods f16:$lo_src0, i32:$lo_src0_modifiers)), - (f32 (VOP3PMadMixMods f16:$lo_src1, i32:$lo_src1_modifiers)), - (f32 (VOP3PMadMixMods f16:$lo_src2, i32:$lo_src2_modifiers)))), - (fpround (fmad (f32 (VOP3PMadMixMods f16:$hi_src0, i32:$hi_src0_modifiers)), - (f32 (VOP3PMadMixMods f16:$hi_src1, i32:$hi_src1_modifiers)), - (f32 (VOP3PMadMixMods f16:$hi_src2, i32:$hi_src2_modifiers)))))), - (v2f16 (V_MAD_MIXHI_F16 $hi_src0_modifiers, $hi_src0, - $hi_src1_modifiers, $hi_src1, - $hi_src2_modifiers, $hi_src2, - DSTCLAMP.ENABLE, - (V_MAD_MIXLO_F16 $lo_src0_modifiers, $lo_src0, - $lo_src1_modifiers, $lo_src1, - $lo_src2_modifiers, $lo_src2, - DSTCLAMP.ENABLE, - (i32 (IMPLICIT_DEF))))) ->; +// Clamp modifier is applied after conversion to f16. +def V_FMA_MIXLO_F16 : VOP3_VOP3PInst<"v_fma_mixlo_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>; + +let ClampLo = 0, ClampHi = 1 in { +def V_FMA_MIXHI_F16 : VOP3_VOP3PInst<"v_fma_mixhi_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>; +} +} + +defm : MadFmaMixPats<fma, V_FMA_MIX_F32, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16>; +} -} // End SubtargetPredicate = [HasMadMixInsts] +let SubtargetPredicate = HasDLInsts in { + +def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", VOP3_Profile<VOP_F32_V2F16_V2F16_F32>, int_amdgcn_fdot2>; +def V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_sdot2>; +def V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_udot2>; +def V_DOT4_I32_I8 : VOP3Inst<"v_dot4_i32_i8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_sdot4>; +def V_DOT4_U32_U8 : VOP3Inst<"v_dot4_u32_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_udot4>; +def V_DOT8_I32_I4 : VOP3Inst<"v_dot8_i32_i4", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_sdot8>; +def V_DOT8_U32_U4 : VOP3Inst<"v_dot8_u32_u4", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_udot8>; + +} // End SubtargetPredicate = HasDLInsts multiclass VOP3P_Real_vi<bits<10> op> { def _vi : VOP3P_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.VI>, @@ -172,6 +206,33 @@ defm V_PK_MUL_F16 : VOP3P_Real_vi <0x390>; defm V_PK_MIN_F16 : VOP3P_Real_vi <0x391>; defm V_PK_MAX_F16 : VOP3P_Real_vi <0x392>; + +let SubtargetPredicate = HasMadMixInsts in { defm V_MAD_MIX_F32 : VOP3P_Real_vi <0x3a0>; defm V_MAD_MIXLO_F16 : VOP3P_Real_vi <0x3a1>; defm V_MAD_MIXHI_F16 : VOP3P_Real_vi <0x3a2>; +} + +let SubtargetPredicate = HasFmaMixInsts in { +let DecoderNamespace = "GFX9_DL" in { +// The mad_mix instructions were renamed and their behaviors changed, +// but the opcode stayed the same so we need to put these in a +// different DecoderNamespace to avoid the ambiguity. +defm V_FMA_MIX_F32 : VOP3P_Real_vi <0x3a0>; +defm V_FMA_MIXLO_F16 : VOP3P_Real_vi <0x3a1>; +defm V_FMA_MIXHI_F16 : VOP3P_Real_vi <0x3a2>; +} +} + + +let SubtargetPredicate = HasDLInsts in { + +defm V_DOT2_F32_F16 : VOP3P_Real_vi <0x3a3>; +defm V_DOT2_I32_I16 : VOP3P_Real_vi <0x3a6>; +defm V_DOT2_U32_U16 : VOP3P_Real_vi <0x3a7>; +defm V_DOT4_I32_I8 : VOP3P_Real_vi <0x3a8>; +defm V_DOT4_U32_U8 : VOP3P_Real_vi <0x3a9>; +defm V_DOT8_I32_I4 : VOP3P_Real_vi <0x3aa>; +defm V_DOT8_U32_U4 : VOP3P_Real_vi <0x3ab>; + +} // End SubtargetPredicate = HasDLInsts diff --git a/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll b/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll index b351a824758..9d2d3690995 100644 --- a/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll +++ b/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll @@ -44,6 +44,8 @@ ; RUN: llc -filetype=obj -march=amdgcn -mcpu=stoney < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX810 %s ; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx900 < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX900 %s ; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx902 < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX902 %s +; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx904 < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX904 %s +; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx906 < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX906 %s ; ARCH-R600: Arch: r600 ; ARCH-GCN: Arch: amdgcn @@ -81,6 +83,8 @@ ; GFX900: EF_AMDGPU_MACH_AMDGCN_GFX900 (0x2C) ; GFX902: EF_AMDGPU_MACH_AMDGCN_GFX902 (0x2D) ; GFX902-NEXT: EF_AMDGPU_XNACK (0x100) +; GFX904: EF_AMDGPU_MACH_AMDGCN_GFX904 (0x2E) +; GFX906: EF_AMDGPU_MACH_AMDGCN_GFX906 (0x2F) ; ALL: ] define amdgpu_kernel void @elf_header() { diff --git a/llvm/test/CodeGen/AMDGPU/fma.ll b/llvm/test/CodeGen/AMDGPU/fma.ll index 8e51f82112f..68e25ee7f4b 100644 --- a/llvm/test/CodeGen/AMDGPU/fma.ll +++ b/llvm/test/CodeGen/AMDGPU/fma.ll @@ -1,4 +1,5 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX906 -check-prefix=FUNC %s ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; RUN: not llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cedar -verify-machineinstrs < %s ; RUN: not llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=juniper -verify-machineinstrs < %s @@ -16,6 +17,7 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone ; FUNC-LABEL: {{^}}fma_f32: ; SI: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}} +; GFX906: v_fmac_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} ; EG: MEM_RAT_{{.*}} STORE_{{.*}} [[RES:T[0-9]\.[XYZW]]], {{T[0-9]\.[XYZW]}}, ; EG: FMA {{\*? *}}[[RES]] @@ -29,10 +31,20 @@ define amdgpu_kernel void @fma_f32(float addrspace(1)* %out, float addrspace(1)* ret void } +; GCN-LABEL: {{^}}fmac_to_3addr_f32: +; GCN: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}} +define float @fmac_to_3addr_f32(float %r0, float %r1, float %r2) { + %r3 = tail call float @llvm.fma.f32(float %r0, float %r1, float %r2) + ret float %r3 +} + ; FUNC-LABEL: {{^}}fma_v2f32: ; SI: v_fma_f32 ; SI: v_fma_f32 +; GFX906: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}} +; GFX906: v_fmac_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} + ; EG: MEM_RAT_{{.*}} STORE_{{.*}} [[RES:T[0-9]]].[[CHLO:[XYZW]]][[CHHI:[XYZW]]], {{T[0-9]\.[XYZW]}}, ; EG-DAG: FMA {{\*? *}}[[RES]].[[CHLO]] ; EG-DAG: FMA {{\*? *}}[[RES]].[[CHHI]] @@ -51,6 +63,10 @@ define amdgpu_kernel void @fma_v2f32(<2 x float> addrspace(1)* %out, <2 x float> ; SI: v_fma_f32 ; SI: v_fma_f32 ; SI: v_fma_f32 +; GFX906: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}} +; GFX906: v_fmac_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+$}} +; GFX906: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}} +; GFX906: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}} ; EG: MEM_RAT_{{.*}} STORE_{{.*}} [[RES:T[0-9]]].{{[XYZW][XYZW][XYZW][XYZW]}}, {{T[0-9]\.[XYZW]}}, ; EG-DAG: FMA {{\*? *}}[[RES]].X @@ -97,3 +113,34 @@ define amdgpu_kernel void @fma_commute_mul_s_f32(float addrspace(1)* noalias %ou store float %fma, float addrspace(1)* %out.gep, align 4 ret void } + +; Without special casing the inline constant check for v_fmac_f32's +; src2, this fails to fold the 1.0 into an fma. + +; FUNC-LABEL: {{^}}fold_inline_imm_into_fmac_src2_f32: +; GFX906: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] +; GFX906: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] + +; GFX906: v_add_f32_e32 [[TMP2:v[0-9]+]], [[A]], [[A]] +; GFX906: v_fma_f32 v{{[0-9]+}}, [[TMP2]], -4.0, 1.0 +define amdgpu_kernel void @fold_inline_imm_into_fmac_src2_f32(float addrspace(1)* %out, float addrspace(1)* %a, float addrspace(1)* %b) nounwind { +bb: + %tid = call i32 @llvm.r600.read.tidig.x() + %tid.ext = sext i32 %tid to i64 + %gep.a = getelementptr inbounds float, float addrspace(1)* %a, i64 %tid.ext + %gep.b = getelementptr inbounds float, float addrspace(1)* %b, i64 %tid.ext + %gep.out = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %tmp = load volatile float, float addrspace(1)* %gep.a + %tmp1 = load volatile float, float addrspace(1)* %gep.b + %tmp2 = fadd contract float %tmp, %tmp + %tmp3 = fmul contract float %tmp2, 4.0 + %tmp4 = fsub contract float 1.0, %tmp3 + %tmp5 = fadd contract float %tmp4, %tmp1 + %tmp6 = fadd contract float %tmp1, %tmp1 + %tmp7 = fmul contract float %tmp6, %tmp + %tmp8 = fsub contract float 1.0, %tmp7 + %tmp9 = fmul contract float %tmp8, 8.0 + %tmp10 = fadd contract float %tmp5, %tmp9 + store float %tmp10, float addrspace(1)* %gep.out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/fmuladd.f32.ll b/llvm/test/CodeGen/AMDGPU/fmuladd.f32.ll index fee3c95c473..4ef293f9a51 100644 --- a/llvm/test/CodeGen/AMDGPU/fmuladd.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/fmuladd.f32.ll @@ -1,12 +1,22 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-STRICT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-STRICT,SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=+fp32-denormals,+fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-STRICT,SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=-fp32-denormals,-fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-STRICT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-STRICT,SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=+fp32-denormals,-fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-STRICT,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-STRICT,GCN-FLUSH-MAD,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-STRICT,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=+fp32-denormals,+fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-STRICT,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=-fp32-denormals,-fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-STRICT,GCN-FLUSH-MAD,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-STRICT,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=+fp32-denormals,-fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-STRICT,SI %s + +; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-CONTRACT,GCN-FLUSH-MAD,SI-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-CONTRACT,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=+fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-CONTRACT,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=-fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-CONTRACT,GCN-FLUSH-MAD,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-CONTRACT,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=+fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-CONTRACT,SI %s + + +; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=gfx900 -mattr=-fp32-denormals -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-STRICT,GCN-FLUSH-MAD,GFX9-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-STRICT,GFX900 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=gfx900 -mattr=+fp32-denormals -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,GFX9-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-STRICT,GFX900 %s + +; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=gfx906 -mattr=-fp32-denormals -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-STRICT,GCN-FLUSH-FMAC,GFX9-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-STRICT,GFX906 %s + +; FIXME: Should probably test this, but sometimes selecting fmac is painful to match. +; XUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=gfx906 -mattr=+fp32-denormals -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,GFX9-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-STRICT,GFX906 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-CONTRACT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-CONTRACT,SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=+fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-CONTRACT,SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=-fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-CONTRACT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-CONTRACT,SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=+fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-CONTRACT,SI %s ; Test all permutations of: fp32 denormals, fast fp contract, fp contract enabled for fmuladd, fmaf fast/slow. @@ -19,7 +29,8 @@ declare half @llvm.fmuladd.f16(half, half, half) #1 declare float @llvm.fabs.f32(float) #1 ; GCN-LABEL: {{^}}fmuladd_f32: -; GCN-FLUSH: v_mac_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} +; GCN-FLUSH-MAD: v_mac_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} +; GCN-FLUSH-FMAC: v_fmac_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} ; GCN-DENORM-FASTFMA: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+}} @@ -57,10 +68,11 @@ define amdgpu_kernel void @fmul_fadd_f32(float addrspace(1)* %out, float addrspa } ; GCN-LABEL: {{^}}fmuladd_2.0_a_b_f32 -; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]], -; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]], +; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]], +; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]], -; GCN-FLUSH: v_mac_f32_e32 [[R2]], 2.0, [[R1]] +; GCN-FLUSH-MAD: v_mac_f32_e32 [[R2]], 2.0, [[R1]] +; GCN-FLUSH-FMAC: v_fmac_f32_e32 [[R2]], 2.0, [[R1]] ; SI-FLUSH: buffer_store_dword [[R2]] ; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]] @@ -86,12 +98,14 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float a } ; GCN-LABEL: {{^}}fmuladd_a_2.0_b_f32 -; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]], -; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]], +; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]], +; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]], + +; GCN-FLUSH-MAD: v_mac_f32_e32 [[R2]], 2.0, [[R1]] +; GCN-FLUSH-FMAC: v_fmac_f32_e32 [[R2]], 2.0, [[R1]] -; GCN-FLUSH: v_mac_f32_e32 [[R2]], 2.0, [[R1]] ; SI-FLUSH: buffer_store_dword [[R2]] -; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]] +; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]] ; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] @@ -99,7 +113,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float a ; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] ; SI-DENORM: buffer_store_dword [[RESULT]] -; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -115,12 +129,13 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float a } ; GCN-LABEL: {{^}}fadd_a_a_b_f32: -; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]], -; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]], +; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]], +; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]], ; GCN-FLUSH: v_mac_f32_e32 [[R2]], 2.0, [[R1]] + ; SI-FLUSH: buffer_store_dword [[R2]] -; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]] +; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]] ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] @@ -131,7 +146,7 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float a ; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] ; SI-DENORM: buffer_store_dword [[RESULT]] -; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @fadd_a_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) #0 { @@ -150,12 +165,13 @@ define amdgpu_kernel void @fadd_a_a_b_f32(float addrspace(1)* %out, } ; GCN-LABEL: {{^}}fadd_b_a_a_f32: -; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]], -; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]], +; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]], +; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]], ; GCN-FLUSH: v_mac_f32_e32 [[R2]], 2.0, [[R1]] + ; SI-FLUSH: buffer_store_dword [[R2]] -; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]] +; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]] ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] @@ -166,7 +182,7 @@ define amdgpu_kernel void @fadd_a_a_b_f32(float addrspace(1)* %out, ; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] ; SI-DENORM: buffer_store_dword [[RESULT]] -; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @fadd_b_a_a_f32(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) #0 { @@ -185,9 +201,10 @@ define amdgpu_kernel void @fadd_b_a_a_f32(float addrspace(1)* %out, } ; GCN-LABEL: {{^}}fmuladd_neg_2.0_a_b_f32 -; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]], -; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]], -; GCN-FLUSH: v_mac_f32_e32 [[R2]], -2.0, [[R1]] +; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]], +; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]], +; GCN-FLUSH-MAD: v_mac_f32_e32 [[R2]], -2.0, [[R1]] +; GCN-FLUSH-FMAC: v_fmac_f32_e32 [[R2]], -2.0, [[R1]] ; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]] @@ -195,7 +212,7 @@ define amdgpu_kernel void @fadd_b_a_a_f32(float addrspace(1)* %out, ; GCN-DENORM-SLOWFMA: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] ; SI-DENORM: buffer_store_dword [[RESULT]] -; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -210,13 +227,16 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, flo ret void } +; XXX ; GCN-LABEL: {{^}}fmuladd_neg_2.0_neg_a_b_f32 -; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]], -; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]], +; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]], +; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]], + +; GCN-FLUSH-MAD: v_mac_f32_e32 [[R2]], 2.0, [[R1]] +; GCN-FLUSH-FMAC: v_fmac_f32_e32 [[R2]], 2.0, [[R1]] -; GCN-FLUSH: v_mac_f32_e32 [[R2]], 2.0, [[R1]] ; SI-FLUSH: buffer_store_dword [[R2]] -; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]] +; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]] ; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] @@ -224,7 +244,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, flo ; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] ; SI-DENORM: buffer_store_dword [[RESULT]] -; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -242,12 +262,14 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, } ; GCN-LABEL: {{^}}fmuladd_2.0_neg_a_b_f32: -; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]], -; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]], +; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]], +; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]], + +; GCN-FLUSH-MAD: v_mac_f32_e32 [[R2]], -2.0, [[R1]] +; GCN-FLUSH-FMAC: v_fmac_f32_e32 [[R2]], -2.0, [[R1]] -; GCN-FLUSH: v_mac_f32_e32 [[R2]], -2.0, [[R1]] ; SI-FLUSH: buffer_store_dword [[R2]] -; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]] +; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]] ; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]] @@ -255,7 +277,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, ; GCN-DENORM-SLOWFMA: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] ; SI-DENORM: buffer_store_dword [[RESULT]] -; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -273,11 +295,13 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, flo } ; GCN-LABEL: {{^}}fmuladd_2.0_a_neg_b_f32: -; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]], -; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]], -; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]] +; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]], +; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]], +; GCN-FLUSH-MAD: v_mad_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]] +; GCN-FLUSH-FMAC: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]] + ; SI-FLUSH: buffer_store_dword [[RESULT]] -; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] ; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]] @@ -285,7 +309,7 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, flo ; GCN-DENORM-SLOWFMA: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] ; SI-DENORM: buffer_store_dword [[RESULT]] -; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -303,9 +327,9 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f32(float addrspace(1)* %out, flo } ; GCN-LABEL: {{^}}mad_sub_f32: -; GCN: {{buffer|flat}}_load_dword [[REGA:v[0-9]+]] -; GCN: {{buffer|flat}}_load_dword [[REGB:v[0-9]+]] -; GCN: {{buffer|flat}}_load_dword [[REGC:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]] ; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]] ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]] @@ -317,7 +341,7 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f32(float addrspace(1)* %out, flo ; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]] ; SI: buffer_store_dword [[RESULT]] -; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @mad_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 @@ -337,9 +361,9 @@ define amdgpu_kernel void @mad_sub_f32(float addrspace(1)* noalias nocapture %ou } ; GCN-LABEL: {{^}}mad_sub_inv_f32: -; GCN: {{buffer|flat}}_load_dword [[REGA:v[0-9]+]] -; GCN: {{buffer|flat}}_load_dword [[REGB:v[0-9]+]] -; GCN: {{buffer|flat}}_load_dword [[REGC:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]] ; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]] @@ -352,7 +376,7 @@ define amdgpu_kernel void @mad_sub_f32(float addrspace(1)* noalias nocapture %ou ; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]] ; SI: buffer_store_dword [[RESULT]] -; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @mad_sub_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 @@ -372,9 +396,9 @@ define amdgpu_kernel void @mad_sub_inv_f32(float addrspace(1)* noalias nocapture } ; GCN-LABEL: {{^}}mad_sub_fabs_f32: -; GCN: {{buffer|flat}}_load_dword [[REGA:v[0-9]+]] -; GCN: {{buffer|flat}}_load_dword [[REGB:v[0-9]+]] -; GCN: {{buffer|flat}}_load_dword [[REGC:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]] ; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]| ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]| @@ -386,7 +410,7 @@ define amdgpu_kernel void @mad_sub_inv_f32(float addrspace(1)* noalias nocapture ; GCN-DENORM-STRICT: v_sub_f32_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]| ; SI: buffer_store_dword [[RESULT]] -; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @mad_sub_fabs_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 @@ -407,10 +431,11 @@ define amdgpu_kernel void @mad_sub_fabs_f32(float addrspace(1)* noalias nocaptur } ; GCN-LABEL: {{^}}mad_sub_fabs_inv_f32: -; GCN: {{buffer|flat}}_load_dword [[REGA:v[0-9]+]] -; GCN: {{buffer|flat}}_load_dword [[REGB:v[0-9]+]] -; GCN: {{buffer|flat}}_load_dword [[REGC:v[0-9]+]] -; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]| +; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]] +; GCN-FLUSH-MAD: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]| +; GCN-FLUSH-FMA: v_fma_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]| ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]| @@ -421,7 +446,7 @@ define amdgpu_kernel void @mad_sub_fabs_f32(float addrspace(1)* noalias nocaptur ; GCN-DENORM-STRICT: v_sub_f32_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]] ; SI: buffer_store_dword [[RESULT]] -; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @mad_sub_fabs_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 @@ -442,13 +467,13 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f32(float addrspace(1)* noalias noca } ; GCN-LABEL: {{^}}neg_neg_mad_f32: -; GCN: {{buffer|flat}}_load_dword [[REGA:v[0-9]+]] -; GCN: {{buffer|flat}}_load_dword [[REGB:v[0-9]+]] -; GCN: {{buffer|flat}}_load_dword [[REGC:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]] ; GCN-FLUSH: v_mac_f32_e32 [[REGC]], [[REGA]], [[REGB]] ; SI-FLUSH: buffer_store_dword [[REGC]] -; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REGC]] +; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REGC]] ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]] @@ -459,7 +484,7 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f32(float addrspace(1)* noalias noca ; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]] ; SI-DENORM: buffer_store_dword [[RESULT]] -; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 @@ -481,9 +506,9 @@ define amdgpu_kernel void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture } ; GCN-LABEL: {{^}}mad_fabs_sub_f32: -; GCN: {{buffer|flat}}_load_dword [[REGA:v[0-9]+]] -; GCN: {{buffer|flat}}_load_dword [[REGB:v[0-9]+]] -; GCN: {{buffer|flat}}_load_dword [[REGC:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]] ; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]] ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]] @@ -495,7 +520,7 @@ define amdgpu_kernel void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture ; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]] ; SI: buffer_store_dword [[RESULT]] -; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @mad_fabs_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 @@ -516,11 +541,11 @@ define amdgpu_kernel void @mad_fabs_sub_f32(float addrspace(1)* noalias nocaptur } ; GCN-LABEL: {{^}}fsub_c_fadd_a_a_f32: -; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]], -; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]], +; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]], +; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]], ; GCN-FLUSH: v_mac_f32_e32 [[R2]], -2.0, [[R1]] ; SI-FLUSH: buffer_store_dword [[R2]] -; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]] +; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]] ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]] @@ -531,7 +556,7 @@ define amdgpu_kernel void @mad_fabs_sub_f32(float addrspace(1)* noalias nocaptur ; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] ; SI-DENORM: buffer_store_dword [[RESULT]] -; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @fsub_c_fadd_a_a_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -549,8 +574,8 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f32(float addrspace(1)* %out, float a } ; GCN-LABEL: {{^}}fsub_fadd_a_a_c_f32: -; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]], -; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]], +; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]], +; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]], ; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]] ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]] @@ -562,7 +587,7 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f32(float addrspace(1)* %out, float a ; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] ; SI: buffer_store_dword [[RESULT]] -; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @fsub_fadd_a_a_c_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/hsa-note-no-func.ll b/llvm/test/CodeGen/AMDGPU/hsa-note-no-func.ll index da08adc2147..dd38d1d2366 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-note-no-func.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-note-no-func.ll @@ -21,6 +21,8 @@ ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx810 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI810 %s ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX900 %s ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx902 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX902 %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx904 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX904 %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx906 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX906 %s ; HSA: .hsa_code_object_version 2,1 ; HSA-SI600: .hsa_code_object_isa 6,0,0,"AMD","AMDGPU" @@ -36,3 +38,5 @@ ; HSA-VI810: .hsa_code_object_isa 8,1,0,"AMD","AMDGPU" ; HSA-GFX900: .hsa_code_object_isa 9,0,0,"AMD","AMDGPU" ; HSA-GFX902: .hsa_code_object_isa 9,0,2,"AMD","AMDGPU" +; HSA-GFX904: .hsa_code_object_isa 9,0,4,"AMD","AMDGPU" +; HSA-GFX906: .hsa_code_object_isa 9,0,6,"AMD","AMDGPU" diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll new file mode 100644 index 00000000000..65ab3e04237 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll @@ -0,0 +1,19 @@ +; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX906 + +declare float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c) + +; GFX906-LABEL: {{^}}test_llvm_amdgcn_fdot2 +; GFX906: v_dot2_f32_f16 +define amdgpu_kernel void @test_llvm_amdgcn_fdot2( + float addrspace(1)* %r, + <2 x half> addrspace(1)* %a, + <2 x half> addrspace(1)* %b, + float addrspace(1)* %c) { +entry: + %a.val = load <2 x half>, <2 x half> addrspace(1)* %a + %b.val = load <2 x half>, <2 x half> addrspace(1)* %b + %c.val = load float, float addrspace(1)* %c + %r.val = call float @llvm.amdgcn.fdot2(<2 x half> %a.val, <2 x half> %b.val, float %c.val) + store float %r.val, float addrspace(1)* %r + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot2.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot2.ll new file mode 100644 index 00000000000..0d8f28bbef1 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot2.ll @@ -0,0 +1,19 @@ +; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GCN --check-prefix=GFX906 + +declare i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %b, i32 %c) + +; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot2 +; GFX906: v_dot2_i32_i16 +define amdgpu_kernel void @test_llvm_amdgcn_sdot2( + i32 addrspace(1)* %r, + <2 x i16> addrspace(1)* %a, + <2 x i16> addrspace(1)* %b, + i32 addrspace(1)* %c) { +entry: + %a.val = load <2 x i16>, <2 x i16> addrspace(1)* %a + %b.val = load <2 x i16>, <2 x i16> addrspace(1)* %b + %c.val = load i32, i32 addrspace(1)* %c + %r.val = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a.val, <2 x i16> %b.val, i32 %c.val) + store i32 %r.val, i32 addrspace(1)* %r + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot4.ll new file mode 100644 index 00000000000..8b664e6f9a4 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot4.ll @@ -0,0 +1,21 @@ +; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GCN --check-prefix=GFX906 + +declare i32 @llvm.amdgcn.sdot4(i32 %a, i32 %b, i32 %c) + +; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot4 +; GFX906: v_dot4_i32_i8 +define amdgpu_kernel void @test_llvm_amdgcn_sdot4( + i32 addrspace(1)* %r, + <4 x i8> addrspace(1)* %a, + <4 x i8> addrspace(1)* %b, + i32 addrspace(1)* %c) { +entry: + %a.val = load <4 x i8>, <4 x i8> addrspace(1)* %a + %b.val = load <4 x i8>, <4 x i8> addrspace(1)* %b + %a.val.cast = bitcast <4 x i8> %a.val to i32 + %b.val.cast = bitcast <4 x i8> %b.val to i32 + %c.val = load i32, i32 addrspace(1)* %c + %r.val = call i32 @llvm.amdgcn.sdot4(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val) + store i32 %r.val, i32 addrspace(1)* %r + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot8.ll new file mode 100644 index 00000000000..e2466eae539 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot8.ll @@ -0,0 +1,21 @@ +; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GCN --check-prefix=GFX906 + +declare i32 @llvm.amdgcn.sdot8(i32 %a, i32 %b, i32 %c) + +; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot8 +; GFX906: v_dot8_i32_i4 +define amdgpu_kernel void @test_llvm_amdgcn_sdot8( + i32 addrspace(1)* %r, + <8 x i4> addrspace(1)* %a, + <8 x i4> addrspace(1)* %b, + i32 addrspace(1)* %c) { +entry: + %a.val = load <8 x i4>, <8 x i4> addrspace(1)* %a + %b.val = load <8 x i4>, <8 x i4> addrspace(1)* %b + %a.val.cast = bitcast <8 x i4> %a.val to i32 + %b.val.cast = bitcast <8 x i4> %b.val to i32 + %c.val = load i32, i32 addrspace(1)* %c + %r.val = call i32 @llvm.amdgcn.sdot8(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val) + store i32 %r.val, i32 addrspace(1)* %r + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot2.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot2.ll new file mode 100644 index 00000000000..b2912cb2334 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot2.ll @@ -0,0 +1,19 @@ +; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GCN --check-prefix=GFX906 + +declare i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %b, i32 %c) + +; GCN-LABEL: {{^}}test_llvm_amdgcn_udot2 +; GFX906: v_dot2_u32_u16 +define amdgpu_kernel void @test_llvm_amdgcn_udot2( + i32 addrspace(1)* %r, + <2 x i16> addrspace(1)* %a, + <2 x i16> addrspace(1)* %b, + i32 addrspace(1)* %c) { +entry: + %a.val = load <2 x i16>, <2 x i16> addrspace(1)* %a + %b.val = load <2 x i16>, <2 x i16> addrspace(1)* %b + %c.val = load i32, i32 addrspace(1)* %c + %r.val = call i32 @llvm.amdgcn.udot2(<2 x i16> %a.val, <2 x i16> %b.val, i32 %c.val) + store i32 %r.val, i32 addrspace(1)* %r + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot4.ll new file mode 100644 index 00000000000..5ce060de700 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot4.ll @@ -0,0 +1,21 @@ +; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GCN --check-prefix=GFX906 + +declare i32 @llvm.amdgcn.udot4(i32 %a, i32 %b, i32 %c) + +; GCN-LABEL: {{^}}test_llvm_amdgcn_udot4 +; GFX906: v_dot4_u32_u8 +define amdgpu_kernel void @test_llvm_amdgcn_udot4( + i32 addrspace(1)* %r, + <4 x i8> addrspace(1)* %a, + <4 x i8> addrspace(1)* %b, + i32 addrspace(1)* %c) { +entry: + %a.val = load <4 x i8>, <4 x i8> addrspace(1)* %a + %b.val = load <4 x i8>, <4 x i8> addrspace(1)* %b + %a.val.cast = bitcast <4 x i8> %a.val to i32 + %b.val.cast = bitcast <4 x i8> %b.val to i32 + %c.val = load i32, i32 addrspace(1)* %c + %r.val = call i32 @llvm.amdgcn.udot4(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val) + store i32 %r.val, i32 addrspace(1)* %r + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot8.ll new file mode 100644 index 00000000000..2599305bc8e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot8.ll @@ -0,0 +1,21 @@ +; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GCN --check-prefix=GFX906 + +declare i32 @llvm.amdgcn.udot8(i32 %a, i32 %b, i32 %c) + +; GCN-LABEL: {{^}}test_llvm_amdgcn_udot8 +; GFX906: v_dot8_u32_u4 +define amdgpu_kernel void @test_llvm_amdgcn_udot8( + i32 addrspace(1)* %r, + <8 x i4> addrspace(1)* %a, + <8 x i4> addrspace(1)* %b, + i32 addrspace(1)* %c) { +entry: + %a.val = load <8 x i4>, <8 x i4> addrspace(1)* %a + %b.val = load <8 x i4>, <8 x i4> addrspace(1)* %b + %a.val.cast = bitcast <8 x i4> %a.val to i32 + %b.val.cast = bitcast <8 x i4> %b.val to i32 + %c.val = load i32, i32 addrspace(1)* %c + %r.val = call i32 @llvm.amdgcn.udot8(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val) + store i32 %r.val, i32 addrspace(1)* %r + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix.ll b/llvm/test/CodeGen/AMDGPU/mad-mix.ll index 58f8ab972be..6f56be1a8a2 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix.ll @@ -1,9 +1,11 @@ -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX900,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX906,GFX9 %s ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,VI %s ; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,CI %s ; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_f16lo: -; GFX9: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] ; encoding: [0x00,0x40,0xa0,0xd3,0x00,0x03,0x0a,0x1c] +; GFX900: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] ; encoding: [0x00,0x40,0xa0,0xd3,0x00,0x03,0x0a,0x1c] +; GFX906: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] ; encoding: [0x00,0x40,0xa0,0xd3,0x00,0x03,0x0a,0x1c] ; VI: v_mac_f32 ; CI: v_mad_f32 define float @v_mad_mix_f32_f16lo_f16lo_f16lo(half %src0, half %src1, half %src2) #0 { @@ -15,7 +17,8 @@ define float @v_mad_mix_f32_f16lo_f16lo_f16lo(half %src0, half %src1, half %src2 } ; GCN-LABEL: {{^}}v_mad_mix_f32_f16hi_f16hi_f16hi_int: -; GFX9: v_mad_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] ; encoding +; GFX900: v_mad_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] ; encoding +; GFX906: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] ; encoding ; CIVI: v_mac_f32 define float @v_mad_mix_f32_f16hi_f16hi_f16hi_int(i32 %src0, i32 %src1, i32 %src2) #0 { %src0.hi = lshr i32 %src0, 16 @@ -35,7 +38,8 @@ define float @v_mad_mix_f32_f16hi_f16hi_f16hi_int(i32 %src0, i32 %src1, i32 %src } ; GCN-LABEL: {{^}}v_mad_mix_f32_f16hi_f16hi_f16hi_elt: -; GFX9: v_mad_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] ; encoding +; GFX900: v_mad_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] ; encoding +; GFX906: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] ; encoding ; VI: v_mac_f32 ; CI: v_mad_f32 define float @v_mad_mix_f32_f16hi_f16hi_f16hi_elt(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 { @@ -50,9 +54,13 @@ define float @v_mad_mix_f32_f16hi_f16hi_f16hi_elt(<2 x half> %src0, <2 x half> % } ; GCN-LABEL: {{^}}v_mad_mix_v2f32: -; GFX9: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: v_mad_mix_f32 v1, v0, v3, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] -; GFX9-NEXT: v_mad_mix_f32 v0, v0, v3, v2 op_sel_hi:[1,1,1] +; GFX900: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mad_mix_f32 v1, v0, v3, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GFX900-NEXT: v_mad_mix_f32 v0, v0, v3, v2 op_sel_hi:[1,1,1] + +; GFX906: v_mov_b32_e32 v3, v1 +; GFX906-NEXT: v_fma_mix_f32 v1, v0, v3, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GFX906-NEXT: v_fma_mix_f32 v0, v0, v3, v2 op_sel_hi:[1,1,1] ; CIVI: v_mac_f32 define <2 x float> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 { @@ -65,10 +73,15 @@ define <2 x float> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x hal ; GCN-LABEL: {{^}}v_mad_mix_v2f32_shuffle: ; GCN: s_waitcnt -; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: v_mad_mix_f32 v1, v0, v3, v2 op_sel:[0,1,1] op_sel_hi:[1,1,1] -; GFX9-NEXT: v_mad_mix_f32 v0, v0, v3, v2 op_sel:[1,0,1] op_sel_hi:[1,1,1] -; GFX9-NEXT: s_setpc_b64 +; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: v_mad_mix_f32 v1, v0, v3, v2 op_sel:[0,1,1] op_sel_hi:[1,1,1] +; GFX900-NEXT: v_mad_mix_f32 v0, v0, v3, v2 op_sel:[1,0,1] op_sel_hi:[1,1,1] +; GFX900-NEXT: s_setpc_b64 + +; GFX906-NEXT: v_mov_b32_e32 v3, v1 +; GFX906-NEXT: v_fma_mix_f32 v1, v0, v3, v2 op_sel:[0,1,1] op_sel_hi:[1,1,1] +; GFX906-NEXT: v_fma_mix_f32 v0, v0, v3, v2 op_sel:[1,0,1] op_sel_hi:[1,1,1] +; GFX906-NEXT: s_setpc_b64 ; CIVI: v_mac_f32 define <2 x float> @v_mad_mix_v2f32_shuffle(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 { @@ -83,9 +96,13 @@ define <2 x float> @v_mad_mix_v2f32_shuffle(<2 x half> %src0, <2 x half> %src1, } ; GCN-LABEL: {{^}}v_mad_mix_f32_negf16lo_f16lo_f16lo: -; GFX9: s_waitcnt -; GFX9-NEXT: v_mad_mix_f32 v0, -v0, v1, v2 op_sel_hi:[1,1,1] ; encoding -; GFX9-NEXT: s_setpc_b64 +; GFX900: s_waitcnt +; GFX900-NEXT: v_mad_mix_f32 v0, -v0, v1, v2 op_sel_hi:[1,1,1] ; encoding +; GFX900-NEXT: s_setpc_b64 + +; GFX906: s_waitcnt +; GFX906-NEXT: v_fma_mix_f32 v0, -v0, v1, v2 op_sel_hi:[1,1,1] ; encoding +; GFX906-NEXT: s_setpc_b64 ; CIVI: v_mad_f32 define float @v_mad_mix_f32_negf16lo_f16lo_f16lo(half %src0, half %src1, half %src2) #0 { @@ -98,7 +115,8 @@ define float @v_mad_mix_f32_negf16lo_f16lo_f16lo(half %src0, half %src1, half %s } ; GCN-LABEL: {{^}}v_mad_mix_f32_absf16lo_f16lo_f16lo: -; GFX9: v_mad_mix_f32 v0, |v0|, v1, v2 op_sel_hi:[1,1,1] +; GFX900: v_mad_mix_f32 v0, |v0|, v1, v2 op_sel_hi:[1,1,1] +; GFX906: v_fma_mix_f32 v0, |v0|, v1, v2 op_sel_hi:[1,1,1] ; CIVI: v_mad_f32 define float @v_mad_mix_f32_absf16lo_f16lo_f16lo(half %src0, half %src1, half %src2) #0 { @@ -111,9 +129,13 @@ define float @v_mad_mix_f32_absf16lo_f16lo_f16lo(half %src0, half %src1, half %s } ; GCN-LABEL: {{^}}v_mad_mix_f32_negabsf16lo_f16lo_f16lo: -; GFX9: s_waitcnt -; GFX9-NEXT: v_mad_mix_f32 v0, -|v0|, v1, v2 op_sel_hi:[1,1,1] -; GFX9-NEXT: s_setpc_b64 +; GFX900: s_waitcnt +; GFX900-NEXT: v_mad_mix_f32 v0, -|v0|, v1, v2 op_sel_hi:[1,1,1] +; GFX900-NEXT: s_setpc_b64 + +; GFX906: s_waitcnt +; GFX906-NEXT: v_fma_mix_f32 v0, -|v0|, v1, v2 op_sel_hi:[1,1,1] +; GFX906-NEXT: s_setpc_b64 ; CIVI: v_mad_f32 define float @v_mad_mix_f32_negabsf16lo_f16lo_f16lo(half %src0, half %src1, half %src2) #0 { @@ -128,7 +150,8 @@ define float @v_mad_mix_f32_negabsf16lo_f16lo_f16lo(half %src0, half %src1, half ; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_f32: ; GCN: s_waitcnt -; GFX9-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] ; encoding +; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] ; encoding +; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] ; encoding ; GFX9-NEXT: s_setpc_b64 ; CIVI: v_mad_f32 @@ -141,7 +164,8 @@ define float @v_mad_mix_f32_f16lo_f16lo_f32(half %src0, half %src1, float %src2) ; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_negf32: ; GCN: s_waitcnt -; GFX9-NEXT: v_mad_mix_f32 v0, v0, v1, -v2 op_sel_hi:[1,1,0] ; encoding +; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, -v2 op_sel_hi:[1,1,0] ; encoding +; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, -v2 op_sel_hi:[1,1,0] ; encoding ; GFX9-NEXT: s_setpc_b64 ; CIVI: v_mad_f32 @@ -155,7 +179,8 @@ define float @v_mad_mix_f32_f16lo_f16lo_negf32(half %src0, half %src1, float %sr ; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_absf32: ; GCN: s_waitcnt -; GFX9-NEXT: v_mad_mix_f32 v0, v0, v1, |v2| op_sel_hi:[1,1,0] ; encoding +; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, |v2| op_sel_hi:[1,1,0] ; encoding +; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, |v2| op_sel_hi:[1,1,0] ; encoding ; GFX9-NEXT: s_setpc_b64 ; CIVI: v_mad_f32 @@ -169,7 +194,8 @@ define float @v_mad_mix_f32_f16lo_f16lo_absf32(half %src0, half %src1, float %sr ; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_negabsf32: ; GCN: s_waitcnt -; GFX9-NEXT: v_mad_mix_f32 v0, v0, v1, -|v2| op_sel_hi:[1,1,0] ; encoding +; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, -|v2| op_sel_hi:[1,1,0] ; encoding +; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, -|v2| op_sel_hi:[1,1,0] ; encoding ; GFX9-NEXT: s_setpc_b64 ; CIVI: v_mad_f32 @@ -189,7 +215,8 @@ define float @v_mad_mix_f32_f16lo_f16lo_negabsf32(half %src0, half %src1, float ; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_f32imm1: ; GCN: s_waitcnt ; GFX9: v_mov_b32_e32 v2, 1.0 -; GFX9-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] ; encoding +; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] ; encoding +; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] ; encoding ; CIVI: v_mad_f32 v0, v0, v1, 1.0 ; GCN-NEXT: s_setpc_b64 @@ -203,7 +230,8 @@ define float @v_mad_mix_f32_f16lo_f16lo_f32imm1(half %src0, half %src1) #0 { ; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_f32imminv2pi: ; GCN: s_waitcnt ; GFX9: v_mov_b32_e32 v2, 0.15915494 -; GFX9: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] ; encoding +; GFX900: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] ; encoding +; GFX906: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] ; encoding ; VI: v_mad_f32 v0, v0, v1, 0.15915494 define float @v_mad_mix_f32_f16lo_f16lo_f32imminv2pi(half %src0, half %src1) #0 { %src0.ext = fpext half %src0 to float @@ -219,7 +247,8 @@ define float @v_mad_mix_f32_f16lo_f16lo_f32imminv2pi(half %src0, half %src1) #0 ; f32 1/2pi = 0x3e22f983 ; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi: ; GFX9: v_mov_b32_e32 v2, 0x3e230000 -; GFX9: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] ; encoding +; GFX900: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] ; encoding +; GFX906: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] ; encoding ; CIVI: v_madak_f32 v0, v0, v1, 0x3e230000 define float @v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi(half %src0, half %src1) #0 { @@ -232,7 +261,8 @@ define float @v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi(half %src0, half %src1) ; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_cvtf16imm63: ; GFX9: v_mov_b32_e32 v2, 0x367c0000 -; GFX9: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] ; encoding +; GFX900: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] ; encoding +; GFX906: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] ; encoding ; CIVI: v_madak_f32 v0, v0, v1, 0x367c0000 define float @v_mad_mix_f32_f16lo_f16lo_cvtf16imm63(half %src0, half %src1) #0 { @@ -246,8 +276,11 @@ define float @v_mad_mix_f32_f16lo_f16lo_cvtf16imm63(half %src0, half %src1) #0 { ; GCN-LABEL: {{^}}v_mad_mix_v2f32_f32imm1: ; GFX9: v_mov_b32_e32 v2, v1 ; GFX9: v_mov_b32_e32 v3, 1.0 -; GFX9: v_mad_mix_f32 v1, v0, v2, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding -; GFX9: v_mad_mix_f32 v0, v0, v2, v3 op_sel_hi:[1,1,0] ; encoding +; GFX900: v_mad_mix_f32 v1, v0, v2, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding +; GFX900: v_mad_mix_f32 v0, v0, v2, v3 op_sel_hi:[1,1,0] ; encoding + +; GFX906: v_fma_mix_f32 v1, v0, v2, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding +; GFX906: v_fma_mix_f32 v0, v0, v2, v3 op_sel_hi:[1,1,0] ; encoding define <2 x float> @v_mad_mix_v2f32_f32imm1(<2 x half> %src0, <2 x half> %src1) #0 { %src0.ext = fpext <2 x half> %src0 to <2 x float> %src1.ext = fpext <2 x half> %src1 to <2 x float> @@ -258,8 +291,11 @@ define <2 x float> @v_mad_mix_v2f32_f32imm1(<2 x half> %src0, <2 x half> %src1) ; GCN-LABEL: {{^}}v_mad_mix_v2f32_cvtf16imminv2pi: ; GFX9: v_mov_b32_e32 v2, v1 ; GFX9: v_mov_b32_e32 v3, 0x3e230000 -; GFX9: v_mad_mix_f32 v1, v0, v2, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding -; GFX9: v_mad_mix_f32 v0, v0, v2, v3 op_sel_hi:[1,1,0] ; encoding +; GFX900: v_mad_mix_f32 v1, v0, v2, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding +; GFX900: v_mad_mix_f32 v0, v0, v2, v3 op_sel_hi:[1,1,0] ; encoding + +; GFX906: v_fma_mix_f32 v1, v0, v2, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding +; GFX906: v_fma_mix_f32 v0, v0, v2, v3 op_sel_hi:[1,1,0] ; encoding define <2 x float> @v_mad_mix_v2f32_cvtf16imminv2pi(<2 x half> %src0, <2 x half> %src1) #0 { %src0.ext = fpext <2 x half> %src0 to <2 x float> %src1.ext = fpext <2 x half> %src1 to <2 x float> @@ -271,8 +307,12 @@ define <2 x float> @v_mad_mix_v2f32_cvtf16imminv2pi(<2 x half> %src0, <2 x half> ; GCN-LABEL: {{^}}v_mad_mix_v2f32_f32imminv2pi: ; GFX9: v_mov_b32_e32 v2, v1 ; GFX9: v_mov_b32_e32 v3, 0.15915494 -; GFX9: v_mad_mix_f32 v1, v0, v2, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding -; GFX9: v_mad_mix_f32 v0, v0, v2, v3 op_sel_hi:[1,1,0] ; encoding + +; GFX900: v_mad_mix_f32 v1, v0, v2, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding +; GFX900: v_mad_mix_f32 v0, v0, v2, v3 op_sel_hi:[1,1,0] ; encoding + +; GFX906: v_fma_mix_f32 v1, v0, v2, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding +; GFX906: v_fma_mix_f32 v0, v0, v2, v3 op_sel_hi:[1,1,0] ; encoding define <2 x float> @v_mad_mix_v2f32_f32imminv2pi(<2 x half> %src0, <2 x half> %src1) #0 { %src0.ext = fpext <2 x half> %src0 to <2 x float> %src1.ext = fpext <2 x half> %src1 to <2 x float> @@ -282,7 +322,8 @@ define <2 x float> @v_mad_mix_v2f32_f32imminv2pi(<2 x half> %src0, <2 x half> %s } ; GCN-LABEL: {{^}}v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt: -; GFX9: v_mad_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp ; encoding +; GFX900: v_mad_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp ; encoding +; GFX906: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp ; encoding ; VI: v_mac_f32_e64 v{{[0-9]}}, v{{[0-9]}}, v{{[0-9]}} clamp{{$}} ; CI: v_mad_f32 v{{[0-9]}}, v{{[0-9]}}, v{{[0-9]}}, v{{[0-9]}} clamp{{$}} define float @v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 { @@ -300,7 +341,7 @@ define float @v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt(<2 x half> %src0, <2 x h ; GCN-LABEL: no_mix_simple: ; GCN: s_waitcnt -; GCN-NEXT: v_mad_f32 v0, v0, v1, v2 +; GCN-NEXT: v_{{mad|fma}}_f32 v0, v0, v1, v2 ; GCN-NEXT: s_setpc_b64 define float @no_mix_simple(float %src0, float %src1, float %src2) #0 { %result = call float @llvm.fmuladd.f32(float %src0, float %src1, float %src2) @@ -309,7 +350,9 @@ define float @no_mix_simple(float %src0, float %src1, float %src2) #0 { ; GCN-LABEL: no_mix_simple_fabs: ; GCN: s_waitcnt -; GCN-NEXT: v_mad_f32 v0, |v0|, v1, v2 +; CIVI-NEXT: v_mad_f32 v0, |v0|, v1, v2 +; GFX900-NEXT: v_mad_f32 v0, |v0|, v1, v2 +; GFX906-NEXT: v_fma_f32 v0, v1, |v0|, v2 ; GCN-NEXT: s_setpc_b64 define float @no_mix_simple_fabs(float %src0, float %src1, float %src2) #0 { %src0.fabs = call float @llvm.fabs.f32(float %src0) @@ -322,10 +365,10 @@ define float @no_mix_simple_fabs(float %src0, float %src1, float %src2) #0 { ; v_mad_mix_f32 flushes. ; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals: -; GFX9: v_cvt_f32_f16 -; GFX9: v_cvt_f32_f16 -; GFX9: v_cvt_f32_f16 -; GFX9: v_fma_f32 +; GFX900: v_cvt_f32_f16 +; GFX900: v_cvt_f32_f16 +; GFX900: v_cvt_f32_f16 +; GFX900: v_fma_f32 define float @v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals(half %src0, half %src1, half %src2) #1 { %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float @@ -335,9 +378,12 @@ define float @v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals(half %src0, half %sr } ; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_f32_denormals: -; GFX9: v_cvt_f32_f16 -; GFX9: v_cvt_f32_f16 -; GFX9: v_fma_f32 +; GFX900: v_cvt_f32_f16 +; GFX900: v_cvt_f32_f16 +; GFX900: v_fma_f32 + +; GFX906-NOT: v_cvt_f32_f16 +; GFX906: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] define float @v_mad_mix_f32_f16lo_f16lo_f32_denormals(half %src0, half %src1, float %src2) #1 { %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float @@ -375,32 +421,35 @@ define float @v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd(half %src0, half ; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_f16lo_f32_flush_fmulfadd: ; GCN: s_waitcnt -; GFX9-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] ; encoding +; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] ; encoding +; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] ; encoding ; GFX9-NEXT: s_setpc_b64 define float @v_mad_mix_f32_f16lo_f16lo_f16lo_f32_flush_fmulfadd(half %src0, half %src1, half %src2) #0 { %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %src2.ext = fpext half %src2 to float - %mul = fmul float %src0.ext, %src1.ext - %result = fadd float %mul, %src2.ext + %mul = fmul contract float %src0.ext, %src1.ext + %result = fadd contract float %mul, %src2.ext ret float %result } ; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_f32_flush_fmulfadd: ; GCN: s_waitcnt -; GFX9-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] ; encoding +; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] ; encoding +; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] ; encoding ; GFX9-NEXT: s_setpc_b64 define float @v_mad_mix_f32_f16lo_f16lo_f32_flush_fmulfadd(half %src0, half %src1, float %src2) #0 { %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float - %mul = fmul float %src0.ext, %src1.ext - %result = fadd float %mul, %src2 + %mul = fmul contract float %src0.ext, %src1.ext + %result = fadd contract float %mul, %src2 ret float %result } ; GCN-LABEL: {{^}}v_mad_mix_f32_negprecvtf16lo_f16lo_f16lo: ; GFX9: s_waitcnt -; GFX9-NEXT: v_mad_mix_f32 v0, -v0, v1, v2 op_sel_hi:[1,1,1] ; encoding +; GFX900-NEXT: v_mad_mix_f32 v0, -v0, v1, v2 op_sel_hi:[1,1,1] ; encoding +; GFX906-NEXT: v_fma_mix_f32 v0, -v0, v1, v2 op_sel_hi:[1,1,1] ; encoding ; GFX9-NEXT: s_setpc_b64 ; CIVI: v_mad_f32 @@ -418,7 +467,7 @@ define float @v_mad_mix_f32_negprecvtf16lo_f16lo_f16lo(i32 %src0.arg, half %src1 ; Make sure we don't fold pre-cvt fneg if we already have a fabs ; GCN-LABEL: {{^}}v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo: -; GFX9: s_waitcnt +; GFX900: s_waitcnt define float @v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo(i32 %src0.arg, half %src1, half %src2) #0 { %src0.arg.bc = bitcast i32 %src0.arg to <2 x half> %src0 = extractelement <2 x half> %src0.arg.bc, i32 1 @@ -433,7 +482,8 @@ define float @v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo(i32 %src0.arg, half % ; GCN-LABEL: {{^}}v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo: ; GFX9: s_waitcnt -; GFX9-NEXT: v_mad_mix_f32 v0, |v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1] +; GFX900-NEXT: v_mad_mix_f32 v0, |v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1] +; GFX906-NEXT: v_fma_mix_f32 v0, |v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1] ; GFX9-NEXT: s_setpc_b64 define float @v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo(i32 %src0.arg, half %src1, half %src2) #0 { %src0.arg.bc = bitcast i32 %src0.arg to <2 x half> @@ -448,7 +498,8 @@ define float @v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo(i32 %src0.arg, half %src1 ; GCN-LABEL: {{^}}v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo: ; GFX9: s_waitcnt -; GFX9-NEXT: v_mad_mix_f32 v0, -v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1] +; GFX900-NEXT: v_mad_mix_f32 v0, -v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1] +; GFX906-NEXT: v_fma_mix_f32 v0, -v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1] ; GFX9-NEXT: s_setpc_b64 define float @v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo(i32 %src0.arg, half %src1, half %src2) #0 { %src0.arg.bc = bitcast i32 %src0.arg to <2 x half> @@ -463,7 +514,8 @@ define float @v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo(i32 %src0.arg, half ; GCN-LABEL: {{^}}v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo: ; GFX9: s_waitcnt -; GFX9-NEXT: v_mad_mix_f32 v0, |v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1] +; GFX900-NEXT: v_mad_mix_f32 v0, |v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1] +; GFX906-NEXT: v_fma_mix_f32 v0, |v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1] ; GFX9-NEXT: s_setpc_b64 define float @v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo(i32 %src0.arg, half %src1, half %src2) #0 { %src0.arg.bc = bitcast i32 %src0.arg to <2 x half> @@ -478,7 +530,8 @@ define float @v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo(i32 %src0.arg, half ; GCN-LABEL: {{^}}v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo: ; GFX9: s_waitcnt -; GFX9-NEXT: v_mad_mix_f32 v0, -|v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1] +; GFX900-NEXT: v_mad_mix_f32 v0, -|v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1] +; GFX906-NEXT: v_fma_mix_f32 v0, -|v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1] ; GFX9-NEXT: s_setpc_b64 define float @v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo(i32 %src0.arg, half %src1, half %src2) #0 { %src0.arg.bc = bitcast i32 %src0.arg to <2 x half> diff --git a/llvm/test/CodeGen/AMDGPU/xnor.ll b/llvm/test/CodeGen/AMDGPU/xnor.ll index 3991e615599..6f0537dfe71 100644 --- a/llvm/test/CodeGen/AMDGPU/xnor.ll +++ b/llvm/test/CodeGen/AMDGPU/xnor.ll @@ -2,6 +2,7 @@ ; RUN: llc -march=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX700 %s ; RUN: llc -march=amdgcn -mcpu=gfx800 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX800 %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX900 %s +; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN-DL --check-prefix=GFX906 %s ; GCN-LABEL: {{^}}scalar_xnor_i32_one_use ; GCN: s_xnor_b32 @@ -62,6 +63,7 @@ entry: ; GCN-NOT: s_xnor_b32 ; GCN: v_xor_b32 ; GCN: v_not_b32 +; GCN-DL: v_xnor_b32 define i32 @vector_xnor_i32_one_use(i32 %a, i32 %b) { entry: %xor = xor i32 %a, %b @@ -75,6 +77,8 @@ entry: ; GCN: v_xor_b32 ; GCN: v_not_b32 ; GCN: v_not_b32 +; GCN-DL: v_xnor_b32 +; GCN-DL: v_xnor_b32 define i64 @vector_xnor_i64_one_use(i64 %a, i64 %b) { entry: %xor = xor i64 %a, %b diff --git a/llvm/test/MC/AMDGPU/dl-insts-err.s b/llvm/test/MC/AMDGPU/dl-insts-err.s new file mode 100644 index 00000000000..9abdaca7ec1 --- /dev/null +++ b/llvm/test/MC/AMDGPU/dl-insts-err.s @@ -0,0 +1,387 @@ +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx800 -show-encoding %s 2>&1 | FileCheck %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx900 -show-encoding %s 2>&1 | FileCheck %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx906 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=GFX906 + +// +// Test unsupported GPUs. +// + +// CHECK: error: instruction not supported on this GPU +v_fmac_f32 v0, v1, v2 +// CHECK: error: instruction not supported on this GPU +v_xnor_b32 v0, v1, v2 +// CHECK: error: instruction not supported on this GPU +v_dot2_f32_f16 v0, v1, v2, v3 +// CHECK: error: instruction not supported on this GPU +v_dot2_i32_i16 v0, v1, v2, v3 +// CHECK: error: instruction not supported on this GPU +v_dot2_u32_u16 v0, v1, v2, v3 +// CHECK: error: instruction not supported on this GPU +v_dot4_i32_i8 v0, v1, v2, v3 +// CHECK: error: instruction not supported on this GPU +v_dot4_u32_u8 v0, v1, v2, v3 +// CHECK: error: instruction not supported on this GPU +v_dot8_i32_i4 v0, v1, v2, v3 +// CHECK: error: instruction not supported on this GPU +v_dot8_u32_u4 v0, v1, v2, v3 + +// +// Test invalid operands. +// + +// GFX906: error: failed parsing operand +v_dot2_f32_f16 v0, v1, v2, v3 op_sel +// GFX906: error: failed parsing operand +v_dot2_f32_f16 v0, v1, v2, v3 op_sel: +// GFX906: error: failed parsing operand +v_dot2_f32_f16 v0, v1, v2, v3 op_sel:[ +// GFX906: error: failed parsing operand +v_dot2_f32_f16 v0, v1, v2, v3 op_sel:] +// GFX906: error: failed parsing operand +v_dot2_f32_f16 v0, v1, v2, v3 op_sel:[] +// GFX906: error: failed parsing operand +v_dot2_f32_f16 v0, v1, v2, v3 op_sel:[,] +// GFX906: error: failed parsing operand +v_dot2_f32_f16 v0, v1, v2, v3 op_sel:[,0] +// GFX906: error: failed parsing operand +v_dot2_f32_f16 v0, v1, v2, v3 op_sel:[0,2] +// GFX906: error: failed parsing operand +v_dot2_f32_f16 v0, v1, v2, v3 op_sel:[2,0] +// GFX906: error: failed parsing operand +v_dot2_f32_f16 v0, v1, v2, v3 op_sel:[2,2] +// GFX906: error: failed parsing operand +v_dot2_f32_f16 v0, v1, v2, v3 op_sel:[0,-1] +// GFX906: error: failed parsing operand +v_dot2_f32_f16 v0, v1, v2, v3 op_sel:[-1,0] +// GFX906: error: failed parsing operand +v_dot2_f32_f16 v0, v1, v2, v3 op_sel:[-1,-1] +// GFX906: error: not a valid operand +v_dot2_f32_f16 v0, v1, v2, v3 op_sel:[0,0,0,0,0] +// GFX906: error: failed parsing operand +v_dot2_f32_f16 v0, v1, v2, v3 op_sel_hi +// GFX906: error: failed parsing operand +v_dot2_f32_f16 v0, v1, v2, v3 op_sel_hi: +// GFX906: error: failed parsing operand +v_dot2_f32_f16 v0, v1, v2, v3 op_sel_hi:[ +// GFX906: error: failed parsing operand +v_dot2_f32_f16 v0, v1, v2, v3 op_sel_hi:] +// GFX906: error: failed parsing operand +v_dot2_f32_f16 v0, v1, v2, v3 op_sel_hi:[] +// GFX906: error: failed parsing operand +v_dot2_f32_f16 v0, v1, v2, v3 op_sel_hi:[,] +// GFX906: error: failed parsing operand +v_dot2_f32_f16 v0, v1, v2, v3 op_sel_hi:[,0] +// GFX906: error: failed parsing operand +v_dot2_f32_f16 v0, v1, v2, v3 op_sel_hi:[0,2] +// GFX906: error: failed parsing operand +v_dot2_f32_f16 v0, v1, v2, v3 op_sel_hi:[2,0] +// GFX906: error: failed parsing operand +v_dot2_f32_f16 v0, v1, v2, v3 op_sel_hi:[2,2] +// GFX906: error: failed parsing operand +v_dot2_f32_f16 v0, v1, v2, v3 op_sel_hi:[0,-1] +// GFX906: error: failed parsing operand +v_dot2_f32_f16 v0, v1, v2, v3 op_sel_hi:[-1,0] +// GFX906: error: failed parsing operand +v_dot2_f32_f16 v0, v1, v2, v3 op_sel_hi:[-1,-1] +// GFX906: error: not a valid operand +v_dot2_f32_f16 v0, v1, v2, v3 op_sel_hi:[0,0,0,0,0] +// GFX906: error: failed parsing operand +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo +// GFX906: error: failed parsing operand +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo: +// GFX906: error: failed parsing operand +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[ +// GFX906: error: failed parsing operand +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:] +// GFX906: error: failed parsing operand +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[] +// GFX906: error: failed parsing operand +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[,] +// GFX906: error: failed parsing operand +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[,0] +// GFX906: error: failed parsing operand +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[0,2] +// GFX906: error: failed parsing operand +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[2,0] +// GFX906: error: failed parsing operand +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[2,2] +// GFX906: error: failed parsing operand +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[0,-1] +// GFX906: error: failed parsing operand +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[-1,0] +// GFX906: error: failed parsing operand +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[-1,-1] +// GFX906: error: not a valid operand +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[0,0,0,0,0] +// GFX906: error: failed parsing operand +v_dot2_f32_f16 v0, v1, v2, v3 neg_hi +// GFX906: error: failed parsing operand +v_dot2_f32_f16 v0, v1, v2, v3 neg_hi: +// GFX906: error: failed parsing operand +v_dot2_f32_f16 v0, v1, v2, v3 neg_hi:[ +// GFX906: error: failed parsing operand +v_dot2_f32_f16 v0, v1, v2, v3 neg_hi:] +// GFX906: error: failed parsing operand +v_dot2_f32_f16 v0, v1, v2, v3 neg_hi:[] +// GFX906: error: failed parsing operand +v_dot2_f32_f16 v0, v1, v2, v3 neg_hi:[,] +// GFX906: error: failed parsing operand +v_dot2_f32_f16 v0, v1, v2, v3 neg_hi:[,0] +// GFX906: error: failed parsing operand +v_dot2_f32_f16 v0, v1, v2, v3 neg_hi:[0,2] +// GFX906: error: failed parsing operand +v_dot2_f32_f16 v0, v1, v2, v3 neg_hi:[2,0] +// GFX906: error: failed parsing operand +v_dot2_f32_f16 v0, v1, v2, v3 neg_hi:[2,2] +// GFX906: error: failed parsing operand +v_dot2_f32_f16 v0, v1, v2, v3 neg_hi:[0,-1] +// GFX906: error: failed parsing operand +v_dot2_f32_f16 v0, v1, v2, v3 neg_hi:[-1,0] +// GFX906: error: failed parsing operand +v_dot2_f32_f16 v0, v1, v2, v3 neg_hi:[-1,-1] +// GFX906: error: not a valid operand +v_dot2_f32_f16 v0, v1, v2, v3 neg_hi:[0,0,0,0,0] +// GFX906: error: failed parsing operand +v_dot2_i32_i16 v0, v1, v2, v3 op_sel +// GFX906: error: failed parsing operand +v_dot2_i32_i16 v0, v1, v2, v3 op_sel: +// GFX906: error: failed parsing operand +v_dot2_i32_i16 v0, v1, v2, v3 op_sel:[ +// GFX906: error: failed parsing operand +v_dot2_i32_i16 v0, v1, v2, v3 op_sel:] +// GFX906: error: failed parsing operand +v_dot2_i32_i16 v0, v1, v2, v3 op_sel:[] +// GFX906: error: failed parsing operand +v_dot2_i32_i16 v0, v1, v2, v3 op_sel:[,] +// GFX906: error: failed parsing operand +v_dot2_i32_i16 v0, v1, v2, v3 op_sel:[,0] +// GFX906: error: failed parsing operand +v_dot2_i32_i16 v0, v1, v2, v3 op_sel:[0,2] +// GFX906: error: failed parsing operand +v_dot2_i32_i16 v0, v1, v2, v3 op_sel:[2,0] +// GFX906: error: failed parsing operand +v_dot2_i32_i16 v0, v1, v2, v3 op_sel:[2,2] +// GFX906: error: failed parsing operand +v_dot2_i32_i16 v0, v1, v2, v3 op_sel:[0,-1] +// GFX906: error: failed parsing operand +v_dot2_i32_i16 v0, v1, v2, v3 op_sel:[-1,0] +// GFX906: error: failed parsing operand +v_dot2_i32_i16 v0, v1, v2, v3 op_sel:[-1,-1] +// GFX906: error: not a valid operand +v_dot2_i32_i16 v0, v1, v2, v3 op_sel:[0,0,0,0,0] +// GFX906: error: failed parsing operand +v_dot2_i32_i16 v0, v1, v2, v3 op_sel_hi +// GFX906: error: failed parsing operand +v_dot2_i32_i16 v0, v1, v2, v3 op_sel_hi: +// GFX906: error: failed parsing operand +v_dot2_i32_i16 v0, v1, v2, v3 op_sel_hi:[ +// GFX906: error: failed parsing operand +v_dot2_i32_i16 v0, v1, v2, v3 op_sel_hi:] +// GFX906: error: failed parsing operand +v_dot2_i32_i16 v0, v1, v2, v3 op_sel_hi:[] +// GFX906: error: failed parsing operand +v_dot2_i32_i16 v0, v1, v2, v3 op_sel_hi:[,] +// GFX906: error: failed parsing operand +v_dot2_i32_i16 v0, v1, v2, v3 op_sel_hi:[,0] +// GFX906: error: failed parsing operand +v_dot2_i32_i16 v0, v1, v2, v3 op_sel_hi:[0,2] +// GFX906: error: failed parsing operand +v_dot2_i32_i16 v0, v1, v2, v3 op_sel_hi:[2,0] +// GFX906: error: failed parsing operand +v_dot2_i32_i16 v0, v1, v2, v3 op_sel_hi:[2,2] +// GFX906: error: failed parsing operand +v_dot2_i32_i16 v0, v1, v2, v3 op_sel_hi:[0,-1] +// GFX906: error: failed parsing operand +v_dot2_i32_i16 v0, v1, v2, v3 op_sel_hi:[-1,0] +// GFX906: error: failed parsing operand +v_dot2_i32_i16 v0, v1, v2, v3 op_sel_hi:[-1,-1] +// GFX906: error: not a valid operand +v_dot2_i32_i16 v0, v1, v2, v3 op_sel_hi:[0,0,0,0,0] +// FIXME-GFX906: error: invalid operand for instruction +v_dot2_i32_i16 v0, v1, v2, v3 neg_lo:[0,0] +// FIXME-GFX906: error: invalid operand for instruction +v_dot2_i32_i16 v0, v1, v2, v3 neg_hi:[0,0] +// GFX906: error: failed parsing operand +v_dot2_u32_u16 v0, v1, v2, v3 op_sel +// GFX906: error: failed parsing operand +v_dot2_u32_u16 v0, v1, v2, v3 op_sel: +// GFX906: error: failed parsing operand +v_dot2_u32_u16 v0, v1, v2, v3 op_sel:[ +// GFX906: error: failed parsing operand +v_dot2_u32_u16 v0, v1, v2, v3 op_sel:] +// GFX906: error: failed parsing operand +v_dot2_u32_u16 v0, v1, v2, v3 op_sel:[] +// GFX906: error: failed parsing operand +v_dot2_u32_u16 v0, v1, v2, v3 op_sel:[,] +// GFX906: error: failed parsing operand +v_dot2_u32_u16 v0, v1, v2, v3 op_sel:[,0] +// GFX906: error: failed parsing operand +v_dot2_u32_u16 v0, v1, v2, v3 op_sel:[0,2] +// GFX906: error: failed parsing operand +v_dot2_u32_u16 v0, v1, v2, v3 op_sel:[2,0] +// GFX906: error: failed parsing operand +v_dot2_u32_u16 v0, v1, v2, v3 op_sel:[2,2] +// GFX906: error: failed parsing operand +v_dot2_u32_u16 v0, v1, v2, v3 op_sel:[0,-1] +// GFX906: error: failed parsing operand +v_dot2_u32_u16 v0, v1, v2, v3 op_sel:[-1,0] +// GFX906: error: failed parsing operand +v_dot2_u32_u16 v0, v1, v2, v3 op_sel:[-1,-1] +// GFX906: error: not a valid operand +v_dot2_u32_u16 v0, v1, v2, v3 op_sel:[0,0,0,0,0] +// GFX906: error: failed parsing operand +v_dot2_u32_u16 v0, v1, v2, v3 op_sel_hi +// GFX906: error: failed parsing operand +v_dot2_u32_u16 v0, v1, v2, v3 op_sel_hi: +// GFX906: error: failed parsing operand +v_dot2_u32_u16 v0, v1, v2, v3 op_sel_hi:[ +// GFX906: error: failed parsing operand +v_dot2_u32_u16 v0, v1, v2, v3 op_sel_hi:] +// GFX906: error: failed parsing operand +v_dot2_u32_u16 v0, v1, v2, v3 op_sel_hi:[] +// GFX906: error: failed parsing operand +v_dot2_u32_u16 v0, v1, v2, v3 op_sel_hi:[,] +// GFX906: error: failed parsing operand +v_dot2_u32_u16 v0, v1, v2, v3 op_sel_hi:[,0] +// GFX906: error: failed parsing operand +v_dot2_u32_u16 v0, v1, v2, v3 op_sel_hi:[0,2] +// GFX906: error: failed parsing operand +v_dot2_u32_u16 v0, v1, v2, v3 op_sel_hi:[2,0] +// GFX906: error: failed parsing operand +v_dot2_u32_u16 v0, v1, v2, v3 op_sel_hi:[2,2] +// GFX906: error: failed parsing operand +v_dot2_u32_u16 v0, v1, v2, v3 op_sel_hi:[0,-1] +// GFX906: error: failed parsing operand +v_dot2_u32_u16 v0, v1, v2, v3 op_sel_hi:[-1,0] +// GFX906: error: failed parsing operand +v_dot2_u32_u16 v0, v1, v2, v3 op_sel_hi:[-1,-1] +// GFX906: error: not a valid operand +v_dot2_u32_u16 v0, v1, v2, v3 op_sel_hi:[0,0,0,0,0] +// FIXME-GFX906: error: invalid operand for instruction +v_dot2_u32_u16 v0, v1, v2, v3 neg_lo:[0,0] +// FIXME-GFX906: error: invalid operand for instruction +v_dot2_u32_u16 v0, v1, v2, v3 neg_hi:[0,0] + +// +// Test regular modifiers. +// + +// GFX906: error: not a valid operand +v_dot2_f32_f16 v0, |v1|, v2, v3 +// GFX906: error: not a valid operand +v_dot2_f32_f16 v0, v1, |v2|, v3 +// GFX906: error: not a valid operand +v_dot2_f32_f16 v0, v1, v2, |v3| +// GFX906: error: not a valid operand +v_dot2_f32_f16 v0, |v1|, |v2|, v3 +// GFX906: error: not a valid operand +v_dot2_f32_f16 v0, |v1|, v2, |v3| +// GFX906: error: not a valid operand +v_dot2_f32_f16 v0, |v1|, |v2|, |v3| +// GFX906: error: invalid operand for instruction +v_dot2_f32_f16 v0, abs(v1), v2, v3 +// GFX906: error: invalid operand for instruction +v_dot2_f32_f16 v0, v1, abs(v2), v3 +// GFX906: error: invalid operand for instruction +v_dot2_f32_f16 v0, v1, v2, abs(v3) +// GFX906: error: invalid operand for instruction +v_dot2_f32_f16 v0, abs(v1), abs(v2), v3 +// GFX906: error: invalid operand for instruction +v_dot2_f32_f16 v0, abs(v1), v2, abs(v3) +// GFX906: error: invalid operand for instruction +v_dot2_f32_f16 v0, abs(v1), abs(v2), abs(v3) +// GFX906: error: invalid operand for instruction +v_dot2_f32_f16 v0, -v1, v2, v3 +// GFX906: error: invalid operand for instruction +v_dot2_f32_f16 v0, v1, -v2, v3 +// GFX906: error: invalid operand for instruction +v_dot2_f32_f16 v0, v1, v2, -v3 +// GFX906: error: invalid operand for instruction +v_dot2_f32_f16 v0, -v1, -v2, v3 +// GFX906: error: invalid operand for instruction +v_dot2_f32_f16 v0, -v1, v2, -v3 +// GFX906: error: invalid operand for instruction +v_dot2_f32_f16 v0, -v1, -v2, -v3 +// GFX906: error: not a valid operand +v_dot2_i32_i16 v0, |v1|, v2, v3 +// GFX906: error: not a valid operand +v_dot2_i32_i16 v0, v1, |v2|, v3 +// GFX906: error: not a valid operand +v_dot2_i32_i16 v0, v1, v2, |v3| +// GFX906: error: not a valid operand +v_dot2_i32_i16 v0, |v1|, |v2|, v3 +// GFX906: error: not a valid operand +v_dot2_i32_i16 v0, |v1|, v2, |v3| +// GFX906: error: not a valid operand +v_dot2_i32_i16 v0, |v1|, |v2|, |v3| +// GFX906: error: invalid operand for instruction +v_dot2_i32_i16 v0, abs(v1), v2, v3 +// GFX906: error: invalid operand for instruction +v_dot2_i32_i16 v0, v1, abs(v2), v3 +// GFX906: error: invalid operand for instruction +v_dot2_i32_i16 v0, v1, v2, abs(v3) +// GFX906: error: invalid operand for instruction +v_dot2_i32_i16 v0, abs(v1), abs(v2), v3 +// GFX906: error: invalid operand for instruction +v_dot2_i32_i16 v0, abs(v1), v2, abs(v3) +// GFX906: error: invalid operand for instruction +v_dot2_i32_i16 v0, abs(v1), abs(v2), abs(v3) +// GFX906: error: invalid operand for instruction +v_dot2_i32_i16 v0, -v1, v2, v3 +// GFX906: error: invalid operand for instruction +v_dot2_i32_i16 v0, v1, -v2, v3 +// GFX906: error: invalid operand for instruction +v_dot2_i32_i16 v0, v1, v2, -v3 +// GFX906: error: invalid operand for instruction +v_dot2_i32_i16 v0, -v1, -v2, v3 +// GFX906: error: invalid operand for instruction +v_dot2_i32_i16 v0, -v1, v2, -v3 +// GFX906: error: invalid operand for instruction +v_dot2_i32_i16 v0, -v1, -v2, -v3 +// GFX906: error: not a valid operand +v_dot2_u32_u16 v0, |v1|, v2, v3 +// GFX906: error: not a valid operand +v_dot2_u32_u16 v0, v1, |v2|, v3 +// GFX906: error: not a valid operand +v_dot2_u32_u16 v0, v1, v2, |v3| +// GFX906: error: not a valid operand +v_dot2_u32_u16 v0, |v1|, |v2|, v3 +// GFX906: error: not a valid operand +v_dot2_u32_u16 v0, |v1|, v2, |v3| +// GFX906: error: not a valid operand +v_dot2_u32_u16 v0, |v1|, |v2|, |v3| +// GFX906: error: invalid operand for instruction +v_dot2_u32_u16 v0, abs(v1), v2, v3 +// GFX906: error: invalid operand for instruction +v_dot2_u32_u16 v0, v1, abs(v2), v3 +// GFX906: error: invalid operand for instruction +v_dot2_u32_u16 v0, v1, v2, abs(v3) +// GFX906: error: invalid operand for instruction +v_dot2_u32_u16 v0, abs(v1), abs(v2), v3 +// GFX906: error: invalid operand for instruction +v_dot2_u32_u16 v0, abs(v1), v2, abs(v3) +// GFX906: error: invalid operand for instruction +v_dot2_u32_u16 v0, abs(v1), abs(v2), abs(v3) +// GFX906: error: invalid operand for instruction +v_dot2_u32_u16 v0, -v1, v2, v3 +// GFX906: error: invalid operand for instruction +v_dot2_u32_u16 v0, v1, -v2, v3 +// GFX906: error: invalid operand for instruction +v_dot2_u32_u16 v0, v1, v2, -v3 +// GFX906: error: invalid operand for instruction +v_dot2_u32_u16 v0, -v1, -v2, v3 +// GFX906: error: invalid operand for instruction +v_dot2_u32_u16 v0, -v1, v2, -v3 +// GFX906: error: invalid operand for instruction +v_dot2_u32_u16 v0, -v1, -v2, -v3 + +// +// Test constant bus restrictions. +// + +// GFX906: error: invalid operand (violates constant bus restrictions) +v_dot2_f32_f16 v255, s1, s2, s3 +// GFX906: error: invalid operand (violates constant bus restrictions) +v_dot2_i32_i16 v255, s1, s2, s3 +// GFX906: error: invalid operand (violates constant bus restrictions) +v_dot2_u32_u16 v255, s1, s2, s3 diff --git a/llvm/test/MC/AMDGPU/dl-insts.s b/llvm/test/MC/AMDGPU/dl-insts.s new file mode 100644 index 00000000000..73ef2bf183d --- /dev/null +++ b/llvm/test/MC/AMDGPU/dl-insts.s @@ -0,0 +1,679 @@ +// RUN: llvm-mc -arch=amdgcn -mcpu=gfx906 -show-encoding %s | FileCheck %s + +// +// VOP2 Instructions. +// + +// CHECK: encoding: [0x01,0x05,0x0a,0x76] +v_fmac_f32 v5, v1, v2 +// CHECK: encoding: [0x01,0x05,0xfe,0x77] +v_fmac_f32 v255, v1, v2 +// CHECK: encoding: [0xff,0x05,0x0a,0x76] +v_fmac_f32 v5, v255, v2 +// CHECK: encoding: [0x01,0x04,0x0a,0x76] +v_fmac_f32 v5, s1, v2 +// CHECK: encoding: [0x65,0x04,0x0a,0x76] +v_fmac_f32 v5, s101, v2 +// CHECK: encoding: [0x66,0x04,0x0a,0x76] +v_fmac_f32 v5, flat_scratch_lo, v2 +// CHECK: encoding: [0x67,0x04,0x0a,0x76] +v_fmac_f32 v5, flat_scratch_hi, v2 +// CHECK: encoding: [0x6a,0x04,0x0a,0x76] +v_fmac_f32 v5, vcc_lo, v2 +// CHECK: encoding: [0x6b,0x04,0x0a,0x76] +v_fmac_f32 v5, vcc_hi, v2 +// CHECK: encoding: [0x7c,0x04,0x0a,0x76] +v_fmac_f32 v5, m0, v2 +// CHECK: encoding: [0x7e,0x04,0x0a,0x76] +v_fmac_f32 v5, exec_lo, v2 +// CHECK: encoding: [0x7f,0x04,0x0a,0x76] +v_fmac_f32 v5, exec_hi, v2 +// CHECK: encoding: [0x80,0x04,0x0a,0x76] +v_fmac_f32 v5, 0, v2 +// CHECK: encoding: [0xc1,0x04,0x0a,0x76] +v_fmac_f32 v5, -1, v2 +// CHECK: encoding: [0xf0,0x04,0x0a,0x76] +v_fmac_f32 v5, 0.5, v2 +// CHECK: encoding: [0xf7,0x04,0x0a,0x76] +v_fmac_f32 v5, -4.0, v2 +// CHECK: encoding: [0xff,0x04,0x0a,0x76,0x56,0x34,0x12,0xaf] +v_fmac_f32 v5, 0xaf123456, v2 +// CHECK: encoding: [0xff,0x04,0x0a,0x76,0x73,0x72,0x71,0x3f] +v_fmac_f32 v5, 0x3f717273, v2 +// CHECK: encoding: [0x01,0xff,0x0b,0x76] +v_fmac_f32 v5, v1, v255 + +// CHECK: encoding: [0x05,0x00,0x3b,0xd1,0x01,0x05,0x02,0x00] +v_fmac_f32_e64 v5, v1, v2 +// CHECK: encoding: [0xff,0x00,0x3b,0xd1,0x01,0x05,0x02,0x00] +v_fmac_f32_e64 v255, v1, v2 +// CHECK: encoding: [0x05,0x00,0x3b,0xd1,0xff,0x05,0x02,0x00] +v_fmac_f32_e64 v5, v255, v2 +// CHECK: encoding: [0x05,0x00,0x3b,0xd1,0x01,0x04,0x02,0x00] +v_fmac_f32_e64 v5, s1, v2 +// CHECK: encoding: [0x05,0x00,0x3b,0xd1,0x65,0x04,0x02,0x00] +v_fmac_f32_e64 v5, s101, v2 +// CHECK: encoding: [0x05,0x00,0x3b,0xd1,0x66,0x04,0x02,0x00] +v_fmac_f32_e64 v5, flat_scratch_lo, v2 +// CHECK: encoding: [0x05,0x00,0x3b,0xd1,0x67,0x04,0x02,0x00] +v_fmac_f32_e64 v5, flat_scratch_hi, v2 +// CHECK: encoding: [0x05,0x00,0x3b,0xd1,0x6a,0x04,0x02,0x00] +v_fmac_f32_e64 v5, vcc_lo, v2 +// CHECK: encoding: [0x05,0x00,0x3b,0xd1,0x6b,0x04,0x02,0x00] +v_fmac_f32_e64 v5, vcc_hi, v2 +// CHECK: encoding: [0x05,0x00,0x3b,0xd1,0x7c,0x04,0x02,0x00] +v_fmac_f32_e64 v5, m0, v2 +// CHECK: encoding: [0x05,0x00,0x3b,0xd1,0x7e,0x04,0x02,0x00] +v_fmac_f32_e64 v5, exec_lo, v2 +// CHECK: encoding: [0x05,0x00,0x3b,0xd1,0x7f,0x04,0x02,0x00] +v_fmac_f32_e64 v5, exec_hi, v2 +// CHECK: encoding: [0x05,0x00,0x3b,0xd1,0x80,0x04,0x02,0x00] +v_fmac_f32_e64 v5, 0, v2 +// CHECK: encoding: [0x05,0x00,0x3b,0xd1,0xc1,0x04,0x02,0x00] +v_fmac_f32_e64 v5, -1, v2 +// CHECK: encoding: [0x05,0x00,0x3b,0xd1,0xf0,0x04,0x02,0x00] +v_fmac_f32_e64 v5, 0.5, v2 +// CHECK: encoding: [0x05,0x00,0x3b,0xd1,0xf7,0x04,0x02,0x00] +v_fmac_f32_e64 v5, -4.0, v2 +// CHECK: encoding: [0x05,0x00,0x3b,0xd1,0x01,0xff,0x03,0x00] +v_fmac_f32_e64 v5, v1, v255 +// CHECK: encoding: [0x05,0x00,0x3b,0xd1,0x01,0x05,0x00,0x00] +v_fmac_f32_e64 v5, v1, s2 +// CHECK: encoding: [0x05,0x00,0x3b,0xd1,0x01,0xcb,0x00,0x00] +v_fmac_f32_e64 v5, v1, s101 +// CHECK: encoding: [0x05,0x00,0x3b,0xd1,0x01,0xcd,0x00,0x00] +v_fmac_f32_e64 v5, v1, flat_scratch_lo +// CHECK: encoding: [0x05,0x00,0x3b,0xd1,0x01,0xcf,0x00,0x00] +v_fmac_f32_e64 v5, v1, flat_scratch_hi +// CHECK: encoding: [0x05,0x00,0x3b,0xd1,0x01,0xd5,0x00,0x00] +v_fmac_f32_e64 v5, v1, vcc_lo +// CHECK: encoding: [0x05,0x00,0x3b,0xd1,0x01,0xd7,0x00,0x00] +v_fmac_f32_e64 v5, v1, vcc_hi +// CHECK: encoding: [0x05,0x00,0x3b,0xd1,0x01,0xf9,0x00,0x00] +v_fmac_f32_e64 v5, v1, m0 +// CHECK: encoding: [0x05,0x00,0x3b,0xd1,0x01,0xfd,0x00,0x00] +v_fmac_f32_e64 v5, v1, exec_lo +// CHECK: encoding: [0x05,0x00,0x3b,0xd1,0x01,0xff,0x00,0x00] +v_fmac_f32_e64 v5, v1, exec_hi +// CHECK: encoding: [0x05,0x00,0x3b,0xd1,0x01,0x01,0x01,0x00] +v_fmac_f32_e64 v5, v1, 0 +// CHECK: encoding: [0x05,0x00,0x3b,0xd1,0x01,0x83,0x01,0x00] +v_fmac_f32_e64 v5, v1, -1 +// CHECK: encoding: [0x05,0x00,0x3b,0xd1,0x01,0xe1,0x01,0x00] +v_fmac_f32_e64 v5, v1, 0.5 +// CHECK: encoding: [0x05,0x00,0x3b,0xd1,0x01,0xef,0x01,0x00] +v_fmac_f32_e64 v5, v1, -4.0 +// CHECK: encoding: [0x05,0x00,0x3b,0xd1,0x01,0x05,0x02,0x20] +v_fmac_f32_e64 v5, -v1, v2 +// CHECK: encoding: [0x05,0x00,0x3b,0xd1,0x01,0x05,0x02,0x40] +v_fmac_f32_e64 v5, v1, -v2 +// CHECK: encoding: [0x05,0x00,0x3b,0xd1,0x01,0x05,0x02,0x60] +v_fmac_f32_e64 v5, -v1, -v2 +// CHECK: encoding: [0x05,0x01,0x3b,0xd1,0x01,0x05,0x02,0x00] +v_fmac_f32_e64 v5, |v1|, v2 +// CHECK: encoding: [0x05,0x02,0x3b,0xd1,0x01,0x05,0x02,0x00] +v_fmac_f32_e64 v5, v1, |v2| +// CHECK: encoding: [0x05,0x03,0x3b,0xd1,0x01,0x05,0x02,0x00] +v_fmac_f32_e64 v5, |v1|, |v2| +// CHECK: encoding: [0x05,0x80,0x3b,0xd1,0x01,0x05,0x02,0x00] +v_fmac_f32_e64 v5, v1, v2 clamp +// CHECK: encoding: [0x05,0x00,0x3b,0xd1,0x01,0x05,0x02,0x08] +v_fmac_f32_e64 v5, v1, v2 mul:2 +// CHECK: encoding: [0x05,0x00,0x3b,0xd1,0x01,0x05,0x02,0x10] +v_fmac_f32_e64 v5, v1, v2 mul:4 +// CHECK: encoding: [0x05,0x00,0x3b,0xd1,0x01,0x05,0x02,0x18] +v_fmac_f32_e64 v5, v1, v2 div:2 + +// CHECK: encoding: [0xfa,0x04,0x0a,0x76,0x01,0xe4,0x00,0x00] +v_fmac_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 +// CHECK: encoding: [0xfa,0x04,0xfe,0x77,0x01,0xe4,0x00,0x00] +v_fmac_f32_dpp v255, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 +// CHECK: encoding: [0xfa,0x04,0x0a,0x76,0xff,0xe4,0x00,0x00] +v_fmac_f32_dpp v5, v255, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 +// CHECK: encoding: [0xfa,0xfe,0x0b,0x76,0x01,0xe4,0x00,0x00] +v_fmac_f32_dpp v5, v1, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 +// CHECK: encoding: [0xfa,0x04,0x0a,0x76,0x01,0x1b,0x00,0x00] +v_fmac_f32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0 +// CHECK: encoding: [0xfa,0x04,0x0a,0x76,0x01,0x40,0x01,0x00] +v_fmac_f32_dpp v5, v1, v2 row_mirror row_mask:0x0 bank_mask:0x0 +// CHECK: encoding: [0xfa,0x04,0x0a,0x76,0x01,0x41,0x01,0x00] +v_fmac_f32_dpp v5, v1, v2 row_half_mirror row_mask:0x0 bank_mask:0x0 +// CHECK: encoding: [0xfa,0x04,0x0a,0x76,0x01,0x42,0x01,0x00] +v_fmac_f32_dpp v5, v1, v2 row_bcast:15 row_mask:0x0 bank_mask:0x0 +// CHECK: encoding: [0xfa,0x04,0x0a,0x76,0x01,0x43,0x01,0x00] +v_fmac_f32_dpp v5, v1, v2 row_bcast:31 row_mask:0x0 bank_mask:0x0 +// CHECK: encoding: [0xfa,0x04,0x0a,0x76,0x01,0x30,0x01,0x00] +v_fmac_f32_dpp v5, v1, v2 wave_shl:1 row_mask:0x0 bank_mask:0x0 +// CHECK: encoding: [0xfa,0x04,0x0a,0x76,0x01,0x34,0x01,0x00] +v_fmac_f32_dpp v5, v1, v2 wave_rol:1 row_mask:0x0 bank_mask:0x0 +// CHECK: encoding: [0xfa,0x04,0x0a,0x76,0x01,0x38,0x01,0x00] +v_fmac_f32_dpp v5, v1, v2 wave_shr:1 row_mask:0x0 bank_mask:0x0 +// CHECK: encoding: [0xfa,0x04,0x0a,0x76,0x01,0x3c,0x01,0x00] +v_fmac_f32_dpp v5, v1, v2 wave_ror:1 row_mask:0x0 bank_mask:0x0 +// CHECK: encoding: [0xfa,0x04,0x0a,0x76,0x01,0x01,0x01,0x00] +v_fmac_f32_dpp v5, v1, v2 row_shl:1 row_mask:0x0 bank_mask:0x0 +// CHECK: encoding: [0xfa,0x04,0x0a,0x76,0x01,0x0f,0x01,0x00] +v_fmac_f32_dpp v5, v1, v2 row_shl:15 row_mask:0x0 bank_mask:0x0 +// CHECK: encoding: [0xfa,0x04,0x0a,0x76,0x01,0x11,0x01,0x00] +v_fmac_f32_dpp v5, v1, v2 row_shr:1 row_mask:0x0 bank_mask:0x0 +// CHECK: encoding: [0xfa,0x04,0x0a,0x76,0x01,0x1f,0x01,0x00] +v_fmac_f32_dpp v5, v1, v2 row_shr:15 row_mask:0x0 bank_mask:0x0 +// CHECK: encoding: [0xfa,0x04,0x0a,0x76,0x01,0x21,0x01,0x00] +v_fmac_f32_dpp v5, v1, v2 row_ror:1 row_mask:0x0 bank_mask:0x0 +// CHECK: encoding: [0xfa,0x04,0x0a,0x76,0x01,0x2f,0x01,0x00] +v_fmac_f32_dpp v5, v1, v2 row_ror:15 row_mask:0x0 bank_mask:0x0 +// CHECK: encoding: [0xfa,0x04,0x0a,0x76,0x01,0xe4,0x00,0x10] +v_fmac_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0 +// CHECK: encoding: [0xfa,0x04,0x0a,0x76,0x01,0xe4,0x00,0x30] +v_fmac_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0 +// CHECK: encoding: [0xfa,0x04,0x0a,0x76,0x01,0xe4,0x00,0xf0] +v_fmac_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0 +// CHECK: encoding: [0xfa,0x04,0x0a,0x76,0x01,0xe4,0x00,0xf0] +v_fmac_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] bank_mask:0x0 +// CHECK: encoding: [0xfa,0x04,0x0a,0x76,0x01,0xe4,0x00,0x01] +v_fmac_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1 +// CHECK: encoding: [0xfa,0x04,0x0a,0x76,0x01,0xe4,0x00,0x03] +v_fmac_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3 +// CHECK: encoding: [0xfa,0x04,0x0a,0x76,0x01,0xe4,0x00,0x0f] +v_fmac_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf +// CHECK: encoding: [0xfa,0x04,0x0a,0x76,0x01,0xe4,0x00,0x0f] +v_fmac_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 +// CHECK: encoding: [0xfa,0x04,0x0a,0x76,0x01,0xe4,0x08,0x00] +v_fmac_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0 +// CHECK: encoding: [0xfa,0x04,0x0a,0x76,0x01,0xe4,0x10,0x00] +v_fmac_f32_dpp v5, -v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 +// CHECK: encoding: [0xfa,0x04,0x0a,0x76,0x01,0xe4,0x20,0x00] +v_fmac_f32_dpp v5, |v1|, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 +// CHECK: encoding: [0xfa,0x04,0x0a,0x76,0x01,0xe4,0x40,0x00] +v_fmac_f32_dpp v5, v1, -v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 +// CHECK: encoding: [0xfa,0x04,0x0a,0x76,0x01,0xe4,0x80,0x00] +v_fmac_f32_dpp v5, v1, |v2| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 + +// CHECK: encoding: [0x01,0x05,0x0a,0x7a] +v_xnor_b32 v5, v1, v2 +// CHECK: encoding: [0x01,0x05,0xfe,0x7b] +v_xnor_b32 v255, v1, v2 +// CHECK: encoding: [0xff,0x05,0x0a,0x7a] +v_xnor_b32 v5, v255, v2 +// CHECK: encoding: [0x01,0x04,0x0a,0x7a] +v_xnor_b32 v5, s1, v2 +// CHECK: encoding: [0x65,0x04,0x0a,0x7a] +v_xnor_b32 v5, s101, v2 +// CHECK: encoding: [0x66,0x04,0x0a,0x7a] +v_xnor_b32 v5, flat_scratch_lo, v2 +// CHECK: encoding: [0x67,0x04,0x0a,0x7a] +v_xnor_b32 v5, flat_scratch_hi, v2 +// CHECK: encoding: [0x6a,0x04,0x0a,0x7a] +v_xnor_b32 v5, vcc_lo, v2 +// CHECK: encoding: [0x6b,0x04,0x0a,0x7a] +v_xnor_b32 v5, vcc_hi, v2 +// CHECK: encoding: [0x7c,0x04,0x0a,0x7a] +v_xnor_b32 v5, m0, v2 +// CHECK: encoding: [0x7e,0x04,0x0a,0x7a] +v_xnor_b32 v5, exec_lo, v2 +// CHECK: encoding: [0x7f,0x04,0x0a,0x7a] +v_xnor_b32 v5, exec_hi, v2 +// CHECK: encoding: [0x80,0x04,0x0a,0x7a] +v_xnor_b32 v5, 0, v2 +// CHECK: encoding: [0xc1,0x04,0x0a,0x7a] +v_xnor_b32 v5, -1, v2 +// CHECK: encoding: [0xf0,0x04,0x0a,0x7a] +v_xnor_b32 v5, 0.5, v2 +// CHECK: encoding: [0xf7,0x04,0x0a,0x7a] +v_xnor_b32 v5, -4.0, v2 +// CHECK: encoding: [0xff,0x04,0x0a,0x7a,0x56,0x34,0x12,0xaf] +v_xnor_b32 v5, 0xaf123456, v2 +// CHECK: encoding: [0xff,0x04,0x0a,0x7a,0x73,0x72,0x71,0x3f] +v_xnor_b32 v5, 0x3f717273, v2 +// CHECK: encoding: [0x01,0xff,0x0b,0x7a] +v_xnor_b32 v5, v1, v255 + +// CHECK: encoding: [0x05,0x00,0x3d,0xd1,0x01,0x05,0x02,0x00] +v_xnor_b32_e64 v5, v1, v2 +// CHECK: encoding: [0xff,0x00,0x3d,0xd1,0x01,0x05,0x02,0x00] +v_xnor_b32_e64 v255, v1, v2 +// CHECK: encoding: [0x05,0x00,0x3d,0xd1,0xff,0x05,0x02,0x00] +v_xnor_b32_e64 v5, v255, v2 +// CHECK: encoding: [0x05,0x00,0x3d,0xd1,0x01,0x04,0x02,0x00] +v_xnor_b32_e64 v5, s1, v2 +// CHECK: encoding: [0x05,0x00,0x3d,0xd1,0x65,0x04,0x02,0x00] +v_xnor_b32_e64 v5, s101, v2 +// CHECK: encoding: [0x05,0x00,0x3d,0xd1,0x66,0x04,0x02,0x00] +v_xnor_b32_e64 v5, flat_scratch_lo, v2 +// CHECK: encoding: [0x05,0x00,0x3d,0xd1,0x67,0x04,0x02,0x00] +v_xnor_b32_e64 v5, flat_scratch_hi, v2 +// CHECK: encoding: [0x05,0x00,0x3d,0xd1,0x6a,0x04,0x02,0x00] +v_xnor_b32_e64 v5, vcc_lo, v2 +// CHECK: encoding: [0x05,0x00,0x3d,0xd1,0x6b,0x04,0x02,0x00] +v_xnor_b32_e64 v5, vcc_hi, v2 +// CHECK: encoding: [0x05,0x00,0x3d,0xd1,0x7c,0x04,0x02,0x00] +v_xnor_b32_e64 v5, m0, v2 +// CHECK: encoding: [0x05,0x00,0x3d,0xd1,0x7e,0x04,0x02,0x00] +v_xnor_b32_e64 v5, exec_lo, v2 +// CHECK: encoding: [0x05,0x00,0x3d,0xd1,0x7f,0x04,0x02,0x00] +v_xnor_b32_e64 v5, exec_hi, v2 +// CHECK: encoding: [0x05,0x00,0x3d,0xd1,0x80,0x04,0x02,0x00] +v_xnor_b32_e64 v5, 0, v2 +// CHECK: encoding: [0x05,0x00,0x3d,0xd1,0xc1,0x04,0x02,0x00] +v_xnor_b32_e64 v5, -1, v2 +// CHECK: encoding: [0x05,0x00,0x3d,0xd1,0xf0,0x04,0x02,0x00] +v_xnor_b32_e64 v5, 0.5, v2 +// CHECK: encoding: [0x05,0x00,0x3d,0xd1,0xf7,0x04,0x02,0x00] +v_xnor_b32_e64 v5, -4.0, v2 +// CHECK: encoding: [0x05,0x00,0x3d,0xd1,0x01,0xff,0x03,0x00] +v_xnor_b32_e64 v5, v1, v255 +// CHECK: encoding: [0x05,0x00,0x3d,0xd1,0x01,0x05,0x00,0x00] +v_xnor_b32_e64 v5, v1, s2 +// CHECK: encoding: [0x05,0x00,0x3d,0xd1,0x01,0xcb,0x00,0x00] +v_xnor_b32_e64 v5, v1, s101 +// CHECK: encoding: [0x05,0x00,0x3d,0xd1,0x01,0xcd,0x00,0x00] +v_xnor_b32_e64 v5, v1, flat_scratch_lo +// CHECK: encoding: [0x05,0x00,0x3d,0xd1,0x01,0xcf,0x00,0x00] +v_xnor_b32_e64 v5, v1, flat_scratch_hi +// CHECK: encoding: [0x05,0x00,0x3d,0xd1,0x01,0xd5,0x00,0x00] +v_xnor_b32_e64 v5, v1, vcc_lo +// CHECK: encoding: [0x05,0x00,0x3d,0xd1,0x01,0xd7,0x00,0x00] +v_xnor_b32_e64 v5, v1, vcc_hi +// CHECK: encoding: [0x05,0x00,0x3d,0xd1,0x01,0xf9,0x00,0x00] +v_xnor_b32_e64 v5, v1, m0 +// CHECK: encoding: [0x05,0x00,0x3d,0xd1,0x01,0xfd,0x00,0x00] +v_xnor_b32_e64 v5, v1, exec_lo +// CHECK: encoding: [0x05,0x00,0x3d,0xd1,0x01,0xff,0x00,0x00] +v_xnor_b32_e64 v5, v1, exec_hi +// CHECK: encoding: [0x05,0x00,0x3d,0xd1,0x01,0x01,0x01,0x00] +v_xnor_b32_e64 v5, v1, 0 +// CHECK: encoding: [0x05,0x00,0x3d,0xd1,0x01,0x83,0x01,0x00] +v_xnor_b32_e64 v5, v1, -1 +// CHECK: encoding: [0x05,0x00,0x3d,0xd1,0x01,0xe1,0x01,0x00] +v_xnor_b32_e64 v5, v1, 0.5 +// CHECK: encoding: [0x05,0x00,0x3d,0xd1,0x01,0xef,0x01,0x00] +v_xnor_b32_e64 v5, v1, -4.0 + +// CHECK: encoding: [0xf9,0x04,0x0a,0x7a,0x01,0x06,0x06,0x06] +v_xnor_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +// CHECK: encoding: [0xf9,0x04,0xfe,0x7b,0x01,0x06,0x06,0x06] +v_xnor_b32_sdwa v255, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +// CHECK: encoding: [0xf9,0x04,0x0a,0x7a,0xff,0x06,0x06,0x06] +v_xnor_b32_sdwa v5, v255, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +// CHECK: encoding: [0xf9,0x04,0x0a,0x7a,0x01,0x06,0x86,0x06] +v_xnor_b32_sdwa v5, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +// CHECK: encoding: [0xf9,0x04,0x0a,0x7a,0x65,0x06,0x86,0x06] +v_xnor_b32_sdwa v5, s101, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +// CHECK: encoding: [0xf9,0x04,0x0a,0x7a,0x66,0x06,0x86,0x06] +v_xnor_b32_sdwa v5, flat_scratch_lo, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +// CHECK: encoding: [0xf9,0x04,0x0a,0x7a,0x67,0x06,0x86,0x06] +v_xnor_b32_sdwa v5, flat_scratch_hi, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +// CHECK: encoding: [0xf9,0x04,0x0a,0x7a,0x6a,0x06,0x86,0x06] +v_xnor_b32_sdwa v5, vcc_lo, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +// CHECK: encoding: [0xf9,0x04,0x0a,0x7a,0x6b,0x06,0x86,0x06] +v_xnor_b32_sdwa v5, vcc_hi, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +// CHECK: encoding: [0xf9,0x04,0x0a,0x7a,0x7c,0x06,0x86,0x06] +v_xnor_b32_sdwa v5, m0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +// CHECK: encoding: [0xf9,0x04,0x0a,0x7a,0x7e,0x06,0x86,0x06] +v_xnor_b32_sdwa v5, exec_lo, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +// CHECK: encoding: [0xf9,0x04,0x0a,0x7a,0x7f,0x06,0x86,0x06] +v_xnor_b32_sdwa v5, exec_hi, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +// CHECK: encoding: [0xf9,0xfe,0x0b,0x7a,0x01,0x06,0x06,0x06] +v_xnor_b32_sdwa v5, v1, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +// CHECK: encoding: [0xf9,0x04,0x0a,0x7a,0x01,0x06,0x06,0x06] +v_xnor_b32_sdwa v5, v1, v2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +// CHECK: encoding: [0xf9,0x04,0x0a,0x7a,0x01,0x00,0x06,0x06] +v_xnor_b32_sdwa v5, v1, v2 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +// CHECK: encoding: [0xf9,0x04,0x0a,0x7a,0x01,0x01,0x06,0x06] +v_xnor_b32_sdwa v5, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +// CHECK: encoding: [0xf9,0x04,0x0a,0x7a,0x01,0x02,0x06,0x06] +v_xnor_b32_sdwa v5, v1, v2 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +// CHECK: encoding: [0xf9,0x04,0x0a,0x7a,0x01,0x03,0x06,0x06] +v_xnor_b32_sdwa v5, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +// CHECK: encoding: [0xf9,0x04,0x0a,0x7a,0x01,0x04,0x06,0x06] +v_xnor_b32_sdwa v5, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +// CHECK: encoding: [0xf9,0x04,0x0a,0x7a,0x01,0x05,0x06,0x06] +v_xnor_b32_sdwa v5, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +// CHECK: encoding: [0xf9,0x04,0x0a,0x7a,0x01,0x0e,0x06,0x06] +v_xnor_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD +// CHECK: encoding: [0xf9,0x04,0x0a,0x7a,0x01,0x16,0x06,0x06] +v_xnor_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD +// CHECK: encoding: [0xf9,0x04,0x0a,0x7a,0x01,0x16,0x06,0x06] +v_xnor_b32_sdwa v5, v1, v2 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD +// CHECK: encoding: [0xf9,0x04,0x0a,0x7a,0x01,0x06,0x06,0x06] +v_xnor_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD +// CHECK: encoding: [0xf9,0x04,0x0a,0x7a,0x01,0x06,0x00,0x06] +v_xnor_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +// CHECK: encoding: [0xf9,0x04,0x0a,0x7a,0x01,0x06,0x01,0x06] +v_xnor_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +// CHECK: encoding: [0xf9,0x04,0x0a,0x7a,0x01,0x06,0x02,0x06] +v_xnor_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD +// CHECK: encoding: [0xf9,0x04,0x0a,0x7a,0x01,0x06,0x03,0x06] +v_xnor_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +// CHECK: encoding: [0xf9,0x04,0x0a,0x7a,0x01,0x06,0x04,0x06] +v_xnor_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +// CHECK: encoding: [0xf9,0x04,0x0a,0x7a,0x01,0x06,0x05,0x06] +v_xnor_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +// CHECK: encoding: [0xf9,0x04,0x0a,0x7a,0x01,0x06,0x0e,0x06] +v_xnor_b32_sdwa v5, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +// CHECK: encoding: [0xf9,0x04,0x0a,0x7a,0x01,0x06,0x06,0x06] +v_xnor_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD +// CHECK: encoding: [0xf9,0x04,0x0a,0x7a,0x01,0x06,0x06,0x00] +v_xnor_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +// CHECK: encoding: [0xf9,0x04,0x0a,0x7a,0x01,0x06,0x06,0x01] +v_xnor_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +// CHECK: encoding: [0xf9,0x04,0x0a,0x7a,0x01,0x06,0x06,0x02] +v_xnor_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +// CHECK: encoding: [0xf9,0x04,0x0a,0x7a,0x01,0x06,0x06,0x03] +v_xnor_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +// CHECK: encoding: [0xf9,0x04,0x0a,0x7a,0x01,0x06,0x06,0x04] +v_xnor_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +// CHECK: encoding: [0xf9,0x04,0x0a,0x7a,0x01,0x06,0x06,0x05] +v_xnor_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +// CHECK: encoding: [0xf9,0x04,0x0a,0x7a,0x01,0x06,0x06,0x0e] +v_xnor_b32_sdwa v5, v1, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD + +// +// VOP3P Instructions. +// + +// CHECK: encoding: [0x00,0x40,0xa3,0xd3,0x01,0x05,0x0e,0x1c] +v_dot2_f32_f16 v0, v1, v2, v3 +// CHECK: encoding: [0x00,0x40,0xa6,0xd3,0x01,0x05,0x0e,0x1c] +v_dot2_i32_i16 v0, v1, v2, v3 +// CHECK: encoding: [0x00,0x40,0xa7,0xd3,0x01,0x05,0x0e,0x1c] +v_dot2_u32_u16 v0, v1, v2, v3 +// CHECK: [0x00,0x00,0xa8,0xd3,0x01,0x05,0x0e,0x04] +v_dot4_i32_i8 v0, v1, v2, v3 +// CHECK: [0x00,0x00,0xa9,0xd3,0x01,0x05,0x0e,0x04] +v_dot4_u32_u8 v0, v1, v2, v3 +// CHECK: [0x00,0x00,0xaa,0xd3,0x01,0x05,0x0e,0x04] +v_dot8_i32_i4 v0, v1, v2, v3 +// CHECK: [0x00,0x00,0xab,0xd3,0x01,0x05,0x0e,0x04] +v_dot8_u32_u4 v0, v1, v2, v3 + +// +// Test op_sel/op_sel_hi. +// + +// CHECK: encoding: [0x00,0x40,0xa3,0xd3,0x01,0x05,0x0e,0x1c] +v_dot2_f32_f16 v0, v1, v2, v3 op_sel:[0,0] +// CHECK: encoding: [0x00,0x50,0xa3,0xd3,0x01,0x05,0x0e,0x1c] +v_dot2_f32_f16 v0, v1, v2, v3 op_sel:[0,1] +// CHECK: encoding: [0x00,0x48,0xa3,0xd3,0x01,0x05,0x0e,0x1c] +v_dot2_f32_f16 v0, v1, v2, v3 op_sel:[1,0] +// CHECK: encoding: [0x00,0x58,0xa3,0xd3,0x01,0x05,0x0e,0x1c] +v_dot2_f32_f16 v0, v1, v2, v3 op_sel:[1,1] +// CHECK: encoding: [0x00,0x00,0xa3,0xd3,0x01,0x05,0x0e,0x04] +v_dot2_f32_f16 v0, v1, v2, v3 op_sel_hi:[0,0] +// CHECK: encoding: [0x00,0x00,0xa3,0xd3,0x01,0x05,0x0e,0x14] +v_dot2_f32_f16 v0, v1, v2, v3 op_sel_hi:[0,1] +// CHECK: encoding: [0x00,0x00,0xa3,0xd3,0x01,0x05,0x0e,0x0c] +v_dot2_f32_f16 v0, v1, v2, v3 op_sel_hi:[1,0] +// CHECK: encoding: [0x00,0x00,0xa3,0xd3,0x01,0x05,0x0e,0x1c] +v_dot2_f32_f16 v0, v1, v2, v3 op_sel_hi:[1,1] +// CHECK: encoding: [0x00,0x00,0xa3,0xd3,0x01,0x05,0x0e,0x04] +v_dot2_f32_f16 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,0] +// CHECK: encoding: [0x00,0x00,0xa3,0xd3,0x01,0x05,0x0e,0x14] +v_dot2_f32_f16 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,1] +// CHECK: encoding: [0x00,0x00,0xa3,0xd3,0x01,0x05,0x0e,0x0c] +v_dot2_f32_f16 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,0] +// CHECK: encoding: [0x00,0x00,0xa3,0xd3,0x01,0x05,0x0e,0x1c] +v_dot2_f32_f16 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,1] +// CHECK: encoding: [0x00,0x10,0xa3,0xd3,0x01,0x05,0x0e,0x04] +v_dot2_f32_f16 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,0] +// CHECK: encoding: [0x00,0x10,0xa3,0xd3,0x01,0x05,0x0e,0x14] +v_dot2_f32_f16 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,1] +// CHECK: encoding: [0x00,0x10,0xa3,0xd3,0x01,0x05,0x0e,0x0c] +v_dot2_f32_f16 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,0] +// CHECK: encoding: [0x00,0x10,0xa3,0xd3,0x01,0x05,0x0e,0x1c] +v_dot2_f32_f16 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,1] +// CHECK: encoding: [0x00,0x08,0xa3,0xd3,0x01,0x05,0x0e,0x04] +v_dot2_f32_f16 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,0] +// CHECK: encoding: [0x00,0x08,0xa3,0xd3,0x01,0x05,0x0e,0x14] +v_dot2_f32_f16 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,1] +// CHECK: encoding: [0x00,0x08,0xa3,0xd3,0x01,0x05,0x0e,0x0c] +v_dot2_f32_f16 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,0] +// CHECK: encoding: [0x00,0x08,0xa3,0xd3,0x01,0x05,0x0e,0x1c] +v_dot2_f32_f16 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,1] +// CHECK: encoding: [0x00,0x18,0xa3,0xd3,0x01,0x05,0x0e,0x04] +v_dot2_f32_f16 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[0,0] +// CHECK: encoding: [0x00,0x18,0xa3,0xd3,0x01,0x05,0x0e,0x14] +v_dot2_f32_f16 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[0,1] +// CHECK: encoding: [0x00,0x18,0xa3,0xd3,0x01,0x05,0x0e,0x0c] +v_dot2_f32_f16 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,0] +// CHECK: encoding: [0x00,0x18,0xa3,0xd3,0x01,0x05,0x0e,0x1c] +v_dot2_f32_f16 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,1] +// CHECK: encoding: [0x00,0x40,0xa6,0xd3,0x01,0x05,0x0e,0x1c] +v_dot2_i32_i16 v0, v1, v2, v3 op_sel:[0,0] +// CHECK: encoding: [0x00,0x50,0xa6,0xd3,0x01,0x05,0x0e,0x1c] +v_dot2_i32_i16 v0, v1, v2, v3 op_sel:[0,1] +// CHECK: encoding: [0x00,0x48,0xa6,0xd3,0x01,0x05,0x0e,0x1c] +v_dot2_i32_i16 v0, v1, v2, v3 op_sel:[1,0] +// CHECK: encoding: [0x00,0x58,0xa6,0xd3,0x01,0x05,0x0e,0x1c] +v_dot2_i32_i16 v0, v1, v2, v3 op_sel:[1,1] +// CHECK: encoding: [0x00,0x00,0xa6,0xd3,0x01,0x05,0x0e,0x04] +v_dot2_i32_i16 v0, v1, v2, v3 op_sel_hi:[0,0] +// CHECK: encoding: [0x00,0x00,0xa6,0xd3,0x01,0x05,0x0e,0x14] +v_dot2_i32_i16 v0, v1, v2, v3 op_sel_hi:[0,1] +// CHECK: encoding: [0x00,0x00,0xa6,0xd3,0x01,0x05,0x0e,0x0c] +v_dot2_i32_i16 v0, v1, v2, v3 op_sel_hi:[1,0] +// CHECK: encoding: [0x00,0x00,0xa6,0xd3,0x01,0x05,0x0e,0x1c] +v_dot2_i32_i16 v0, v1, v2, v3 op_sel_hi:[1,1] +// CHECK: encoding: [0x00,0x00,0xa6,0xd3,0x01,0x05,0x0e,0x04] +v_dot2_i32_i16 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,0] +// CHECK: encoding: [0x00,0x00,0xa6,0xd3,0x01,0x05,0x0e,0x14] +v_dot2_i32_i16 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,1] +// CHECK: encoding: [0x00,0x00,0xa6,0xd3,0x01,0x05,0x0e,0x0c] +v_dot2_i32_i16 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,0] +// CHECK: encoding: [0x00,0x00,0xa6,0xd3,0x01,0x05,0x0e,0x1c] +v_dot2_i32_i16 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,1] +// CHECK: encoding: [0x00,0x10,0xa6,0xd3,0x01,0x05,0x0e,0x04] +v_dot2_i32_i16 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,0] +// CHECK: encoding: [0x00,0x10,0xa6,0xd3,0x01,0x05,0x0e,0x14] +v_dot2_i32_i16 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,1] +// CHECK: encoding: [0x00,0x10,0xa6,0xd3,0x01,0x05,0x0e,0x0c] +v_dot2_i32_i16 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,0] +// CHECK: encoding: [0x00,0x10,0xa6,0xd3,0x01,0x05,0x0e,0x1c] +v_dot2_i32_i16 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,1] +// CHECK: encoding: [0x00,0x08,0xa6,0xd3,0x01,0x05,0x0e,0x04] +v_dot2_i32_i16 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,0] +// CHECK: encoding: [0x00,0x08,0xa6,0xd3,0x01,0x05,0x0e,0x14] +v_dot2_i32_i16 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,1] +// CHECK: encoding: [0x00,0x08,0xa6,0xd3,0x01,0x05,0x0e,0x0c] +v_dot2_i32_i16 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,0] +// CHECK: encoding: [0x00,0x08,0xa6,0xd3,0x01,0x05,0x0e,0x1c] +v_dot2_i32_i16 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,1] +// CHECK: encoding: [0x00,0x18,0xa6,0xd3,0x01,0x05,0x0e,0x04] +v_dot2_i32_i16 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[0,0] +// CHECK: encoding: [0x00,0x18,0xa6,0xd3,0x01,0x05,0x0e,0x14] +v_dot2_i32_i16 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[0,1] +// CHECK: encoding: [0x00,0x18,0xa6,0xd3,0x01,0x05,0x0e,0x0c] +v_dot2_i32_i16 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,0] +// CHECK: encoding: [0x00,0x18,0xa6,0xd3,0x01,0x05,0x0e,0x1c] +v_dot2_i32_i16 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,1] +// CHECK: encoding: [0x00,0x40,0xa7,0xd3,0x01,0x05,0x0e,0x1c] +v_dot2_u32_u16 v0, v1, v2, v3 op_sel:[0,0] +// CHECK: encoding: [0x00,0x50,0xa7,0xd3,0x01,0x05,0x0e,0x1c] +v_dot2_u32_u16 v0, v1, v2, v3 op_sel:[0,1] +// CHECK: encoding: [0x00,0x48,0xa7,0xd3,0x01,0x05,0x0e,0x1c] +v_dot2_u32_u16 v0, v1, v2, v3 op_sel:[1,0] +// CHECK: encoding: [0x00,0x58,0xa7,0xd3,0x01,0x05,0x0e,0x1c] +v_dot2_u32_u16 v0, v1, v2, v3 op_sel:[1,1] +// CHECK: encoding: [0x00,0x00,0xa7,0xd3,0x01,0x05,0x0e,0x04] +v_dot2_u32_u16 v0, v1, v2, v3 op_sel_hi:[0,0] +// CHECK: encoding: [0x00,0x00,0xa7,0xd3,0x01,0x05,0x0e,0x14] +v_dot2_u32_u16 v0, v1, v2, v3 op_sel_hi:[0,1] +// CHECK: encoding: [0x00,0x00,0xa7,0xd3,0x01,0x05,0x0e,0x0c] +v_dot2_u32_u16 v0, v1, v2, v3 op_sel_hi:[1,0] +// CHECK: encoding: [0x00,0x00,0xa7,0xd3,0x01,0x05,0x0e,0x1c] +v_dot2_u32_u16 v0, v1, v2, v3 op_sel_hi:[1,1] +// CHECK: encoding: [0x00,0x00,0xa7,0xd3,0x01,0x05,0x0e,0x04] +v_dot2_u32_u16 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,0] +// CHECK: encoding: [0x00,0x00,0xa7,0xd3,0x01,0x05,0x0e,0x14] +v_dot2_u32_u16 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,1] +// CHECK: encoding: [0x00,0x00,0xa7,0xd3,0x01,0x05,0x0e,0x0c] +v_dot2_u32_u16 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,0] +// CHECK: encoding: [0x00,0x00,0xa7,0xd3,0x01,0x05,0x0e,0x1c] +v_dot2_u32_u16 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,1] +// CHECK: encoding: [0x00,0x10,0xa7,0xd3,0x01,0x05,0x0e,0x04] +v_dot2_u32_u16 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,0] +// CHECK: encoding: [0x00,0x10,0xa7,0xd3,0x01,0x05,0x0e,0x14] +v_dot2_u32_u16 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,1] +// CHECK: encoding: [0x00,0x10,0xa7,0xd3,0x01,0x05,0x0e,0x0c] +v_dot2_u32_u16 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,0] +// CHECK: encoding: [0x00,0x10,0xa7,0xd3,0x01,0x05,0x0e,0x1c] +v_dot2_u32_u16 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,1] +// CHECK: encoding: [0x00,0x08,0xa7,0xd3,0x01,0x05,0x0e,0x04] +v_dot2_u32_u16 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,0] +// CHECK: encoding: [0x00,0x08,0xa7,0xd3,0x01,0x05,0x0e,0x14] +v_dot2_u32_u16 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,1] +// CHECK: encoding: [0x00,0x08,0xa7,0xd3,0x01,0x05,0x0e,0x0c] +v_dot2_u32_u16 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,0] +// CHECK: encoding: [0x00,0x08,0xa7,0xd3,0x01,0x05,0x0e,0x1c] +v_dot2_u32_u16 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,1] +// CHECK: encoding: [0x00,0x18,0xa7,0xd3,0x01,0x05,0x0e,0x04] +v_dot2_u32_u16 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[0,0] +// CHECK: encoding: [0x00,0x18,0xa7,0xd3,0x01,0x05,0x0e,0x14] +v_dot2_u32_u16 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[0,1] +// CHECK: encoding: [0x00,0x18,0xa7,0xd3,0x01,0x05,0x0e,0x0c] +v_dot2_u32_u16 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,0] +// CHECK: encoding: [0x00,0x18,0xa7,0xd3,0x01,0x05,0x0e,0x1c] +v_dot2_u32_u16 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,1] + +// +// Test clamp. +// + +// CHECK: encoding: [0x00,0xc0,0xa3,0xd3,0x01,0x05,0x0e,0x1c] +v_dot2_f32_f16 v0, v1, v2, v3 clamp +// CHECK: encoding: [0x00,0xc0,0xa6,0xd3,0x01,0x05,0x0e,0x1c] +v_dot2_i32_i16 v0, v1, v2, v3 clamp +// CHECK: encoding: [0x00,0xc0,0xa7,0xd3,0x01,0x05,0x0e,0x1c] +v_dot2_u32_u16 v0, v1, v2, v3 clamp + +// +// Test neg_lo/neg_hi. +// + +// CHECK: encoding: [0x00,0x40,0xa3,0xd3,0x01,0x05,0x0e,0x1c] +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[0,0,0] +// CHECK: encoding: [0x00,0x40,0xa3,0xd3,0x01,0x05,0x0e,0x3c] +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[1,0,0] +// CHECK: encoding: [0x00,0x40,0xa3,0xd3,0x01,0x05,0x0e,0x5c] +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[0,1,0] +// CHECK: encoding: [0x00,0x40,0xa3,0xd3,0x01,0x05,0x0e,0x9c] +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[0,0,1] +// CHECK: encoding: [0x00,0x40,0xa3,0xd3,0x01,0x05,0x0e,0x7c] +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[1,1,0] +// CHECK: encoding: [0x00,0x40,0xa3,0xd3,0x01,0x05,0x0e,0xbc] +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[1,0,1] +// CHECK: encoding: [0x00,0x40,0xa3,0xd3,0x01,0x05,0x0e,0xfc] +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[1,1,1] +// CHECK: encoding: [0x00,0x40,0xa3,0xd3,0x01,0x05,0x0e,0x1c] +v_dot2_f32_f16 v0, v1, v2, v3 neg_hi:[0,0,0] +// CHECK: encoding: [0x00,0x41,0xa3,0xd3,0x01,0x05,0x0e,0x1c] +v_dot2_f32_f16 v0, v1, v2, v3 neg_hi:[1,0,0] +// CHECK: encoding: [0x00,0x42,0xa3,0xd3,0x01,0x05,0x0e,0x1c] +v_dot2_f32_f16 v0, v1, v2, v3 neg_hi:[0,1,0] +// CHECK: encoding: [0x00,0x44,0xa3,0xd3,0x01,0x05,0x0e,0x1c] +v_dot2_f32_f16 v0, v1, v2, v3 neg_hi:[0,0,1] +// CHECK: encoding: [0x00,0x43,0xa3,0xd3,0x01,0x05,0x0e,0x1c] +v_dot2_f32_f16 v0, v1, v2, v3 neg_hi:[1,1,0] +// CHECK: encoding: [0x00,0x45,0xa3,0xd3,0x01,0x05,0x0e,0x1c] +v_dot2_f32_f16 v0, v1, v2, v3 neg_hi:[1,0,1] +// CHECK: encoding: [0x00,0x47,0xa3,0xd3,0x01,0x05,0x0e,0x1c] +v_dot2_f32_f16 v0, v1, v2, v3 neg_hi:[1,1,1] +// CHECK: encoding: [0x00,0x40,0xa3,0xd3,0x01,0x05,0x0e,0x1c] +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[0,0,0] neg_hi:[0,0,0] +// CHECK: encoding: [0x00,0x40,0xa3,0xd3,0x01,0x05,0x0e,0x3c] +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[1,0,0] neg_hi:[0,0,0] +// CHECK: encoding: [0x00,0x40,0xa3,0xd3,0x01,0x05,0x0e,0x5c] +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[0,1,0] neg_hi:[0,0,0] +// CHECK: encoding: [0x00,0x40,0xa3,0xd3,0x01,0x05,0x0e,0x9c] +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[0,0,1] neg_hi:[0,0,0] +// CHECK: encoding: [0x00,0x40,0xa3,0xd3,0x01,0x05,0x0e,0x7c] +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[0,0,0] +// CHECK: encoding: [0x00,0x40,0xa3,0xd3,0x01,0x05,0x0e,0xbc] +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[1,0,1] neg_hi:[0,0,0] +// CHECK: encoding: [0x00,0x40,0xa3,0xd3,0x01,0x05,0x0e,0xfc] +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[1,1,1] neg_hi:[0,0,0] +// CHECK: encoding: [0x00,0x41,0xa3,0xd3,0x01,0x05,0x0e,0x1c] +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[0,0,0] neg_hi:[1,0,0] +// CHECK: encoding: [0x00,0x41,0xa3,0xd3,0x01,0x05,0x0e,0x3c] +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[1,0,0] neg_hi:[1,0,0] +// CHECK: encoding: [0x00,0x41,0xa3,0xd3,0x01,0x05,0x0e,0x5c] +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[0,1,0] neg_hi:[1,0,0] +// CHECK: encoding: [0x00,0x41,0xa3,0xd3,0x01,0x05,0x0e,0x9c] +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[0,0,1] neg_hi:[1,0,0] +// CHECK: encoding: [0x00,0x41,0xa3,0xd3,0x01,0x05,0x0e,0x7c] +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[1,0,0] +// CHECK: encoding: [0x00,0x41,0xa3,0xd3,0x01,0x05,0x0e,0xbc] +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[1,0,1] neg_hi:[1,0,0] +// CHECK: encoding: [0x00,0x41,0xa3,0xd3,0x01,0x05,0x0e,0xfc] +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[1,1,1] neg_hi:[1,0,0] +// CHECK: encoding: [0x00,0x42,0xa3,0xd3,0x01,0x05,0x0e,0x1c] +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[0,0,0] neg_hi:[0,1,0] +// CHECK: encoding: [0x00,0x42,0xa3,0xd3,0x01,0x05,0x0e,0x3c] +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[1,0,0] neg_hi:[0,1,0] +// CHECK: encoding: [0x00,0x42,0xa3,0xd3,0x01,0x05,0x0e,0x5c] +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[0,1,0] neg_hi:[0,1,0] +// CHECK: encoding: [0x00,0x42,0xa3,0xd3,0x01,0x05,0x0e,0x9c] +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[0,0,1] neg_hi:[0,1,0] +// CHECK: encoding: [0x00,0x42,0xa3,0xd3,0x01,0x05,0x0e,0x7c] +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[0,1,0] +// CHECK: encoding: [0x00,0x42,0xa3,0xd3,0x01,0x05,0x0e,0xbc] +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[1,0,1] neg_hi:[0,1,0] +// CHECK: encoding: [0x00,0x42,0xa3,0xd3,0x01,0x05,0x0e,0xfc] +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[1,1,1] neg_hi:[0,1,0] +// CHECK: encoding: [0x00,0x44,0xa3,0xd3,0x01,0x05,0x0e,0x1c] +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[0,0,0] neg_hi:[0,0,1] +// CHECK: encoding: [0x00,0x44,0xa3,0xd3,0x01,0x05,0x0e,0x3c] +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[1,0,0] neg_hi:[0,0,1] +// CHECK: encoding: [0x00,0x44,0xa3,0xd3,0x01,0x05,0x0e,0x5c] +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[0,1,0] neg_hi:[0,0,1] +// CHECK: encoding: [0x00,0x44,0xa3,0xd3,0x01,0x05,0x0e,0x9c] +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[0,0,1] neg_hi:[0,0,1] +// CHECK: encoding: [0x00,0x44,0xa3,0xd3,0x01,0x05,0x0e,0x7c] +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[0,0,1] +// CHECK: encoding: [0x00,0x44,0xa3,0xd3,0x01,0x05,0x0e,0xbc] +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[1,0,1] neg_hi:[0,0,1] +// CHECK: encoding: [0x00,0x44,0xa3,0xd3,0x01,0x05,0x0e,0xfc] +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[1,1,1] neg_hi:[0,0,1] +// CHECK: encoding: [0x00,0x43,0xa3,0xd3,0x01,0x05,0x0e,0x1c] +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[0,0,0] neg_hi:[1,1,0] +// CHECK: encoding: [0x00,0x43,0xa3,0xd3,0x01,0x05,0x0e,0x3c] +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[1,0,0] neg_hi:[1,1,0] +// CHECK: encoding: [0x00,0x43,0xa3,0xd3,0x01,0x05,0x0e,0x5c] +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[0,1,0] neg_hi:[1,1,0] +// CHECK: encoding: [0x00,0x43,0xa3,0xd3,0x01,0x05,0x0e,0x9c] +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[0,0,1] neg_hi:[1,1,0] +// CHECK: encoding: [0x00,0x43,0xa3,0xd3,0x01,0x05,0x0e,0x7c] +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[1,1,0] +// CHECK: encoding: [0x00,0x43,0xa3,0xd3,0x01,0x05,0x0e,0xbc] +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[1,0,1] neg_hi:[1,1,0] +// CHECK: encoding: [0x00,0x43,0xa3,0xd3,0x01,0x05,0x0e,0xfc] +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[1,1,1] neg_hi:[1,1,0] +// CHECK: encoding: [0x00,0x45,0xa3,0xd3,0x01,0x05,0x0e,0x1c] +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[0,0,0] neg_hi:[1,0,1] +// CHECK: encoding: [0x00,0x45,0xa3,0xd3,0x01,0x05,0x0e,0x3c] +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[1,0,0] neg_hi:[1,0,1] +// CHECK: encoding: [0x00,0x45,0xa3,0xd3,0x01,0x05,0x0e,0x5c] +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[0,1,0] neg_hi:[1,0,1] +// CHECK: encoding: [0x00,0x45,0xa3,0xd3,0x01,0x05,0x0e,0x9c] +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[0,0,1] neg_hi:[1,0,1] +// CHECK: encoding: [0x00,0x45,0xa3,0xd3,0x01,0x05,0x0e,0x7c] +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[1,0,1] +// CHECK: encoding: [0x00,0x45,0xa3,0xd3,0x01,0x05,0x0e,0xbc] +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[1,0,1] neg_hi:[1,0,1] +// CHECK: encoding: [0x00,0x45,0xa3,0xd3,0x01,0x05,0x0e,0xfc] +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[1,1,1] neg_hi:[1,0,1] +// CHECK: encoding: [0x00,0x47,0xa3,0xd3,0x01,0x05,0x0e,0x1c] +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[0,0,0] neg_hi:[1,1,1] +// CHECK: encoding: [0x00,0x47,0xa3,0xd3,0x01,0x05,0x0e,0x3c] +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[1,0,0] neg_hi:[1,1,1] +// CHECK: encoding: [0x00,0x47,0xa3,0xd3,0x01,0x05,0x0e,0x5c] +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[0,1,0] neg_hi:[1,1,1] +// CHECK: encoding: [0x00,0x47,0xa3,0xd3,0x01,0x05,0x0e,0x9c] +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[0,0,1] neg_hi:[1,1,1] +// CHECK: encoding: [0x00,0x47,0xa3,0xd3,0x01,0x05,0x0e,0x7c] +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[1,1,1] +// CHECK: encoding: [0x00,0x47,0xa3,0xd3,0x01,0x05,0x0e,0xbc] +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[1,0,1] neg_hi:[1,1,1] +// CHECK: encoding: [0x00,0x47,0xa3,0xd3,0x01,0x05,0x0e,0xfc] +v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[1,1,1] neg_hi:[1,1,1] diff --git a/llvm/test/MC/AMDGPU/fma-mix.s b/llvm/test/MC/AMDGPU/fma-mix.s new file mode 100644 index 00000000000..2e0bca2ea87 --- /dev/null +++ b/llvm/test/MC/AMDGPU/fma-mix.s @@ -0,0 +1,102 @@ +// RUN: llvm-mc -arch=amdgcn -mcpu=gfx906 -show-encoding %s | FileCheck -check-prefix=GFX9-FMAMIX %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx901 -show-encoding %s 2>&1 | FileCheck -check-prefix=GFX9-MADMIX-ERR %s + +v_fma_mix_f32 v0, v1, v2, v3 +// GFX9-FMAMIX: v_fma_mix_f32 v0, v1, v2, v3 ; encoding: [0x00,0x00,0xa0,0xd3,0x01,0x05,0x0e,0x04] +// GFX9-MADMIX-ERR: error: instruction not supported on this GPU + +v_fma_mixlo_f16 v0, v1, v2, v3 +// GFX9-FMAMIX: v_fma_mixlo_f16 v0, v1, v2, v3 ; encoding: [0x00,0x00,0xa1,0xd3,0x01,0x05,0x0e,0x04] +// GFX9-MADMIX-ERR: error: instruction not supported on this GPU + +v_fma_mixhi_f16 v0, v1, v2, v3 +// GFX9-FMAMIX: v_fma_mixhi_f16 v0, v1, v2, v3 ; encoding: [0x00,0x00,0xa2,0xd3,0x01,0x05,0x0e,0x04] +// GFX9-MADMIX-ERR: error: instruction not supported on this GPU + +// +// Regular source modifiers on non-packed instructions +// + +v_fma_mix_f32 v0, abs(v1), v2, v3 +// GFX9-FMAMIX: v_fma_mix_f32 v0, |v1|, v2, v3 ; encoding: [0x00,0x01,0xa0,0xd3,0x01,0x05,0x0e,0x04] + +// FIXME: Better error +// GFX9-MADMIX-ERR: error: invalid operand for instruction + +v_fma_mix_f32 v0, v1, abs(v2), v3 +// GFX9-FMAMIX: v_fma_mix_f32 v0, v1, |v2|, v3 ; encoding: [0x00,0x02,0xa0,0xd3,0x01,0x05,0x0e,0x04] + +v_fma_mix_f32 v0, v1, v2, abs(v3) +// GFX9-FMAMIX: v_fma_mix_f32 v0, v1, v2, |v3| ; encoding: [0x00,0x04,0xa0,0xd3,0x01,0x05,0x0e,0x04] + +v_fma_mix_f32 v0, -v1, v2, v3 +// GFX9-FMAMIX: v_fma_mix_f32 v0, -v1, v2, v3 ; encoding: [0x00,0x00,0xa0,0xd3,0x01,0x05,0x0e,0x24] + +v_fma_mix_f32 v0, v1, -v2, v3 +// GFX9-FMAMIX: v_fma_mix_f32 v0, v1, -v2, v3 ; encoding: [0x00,0x00,0xa0,0xd3,0x01,0x05,0x0e,0x44] + +v_fma_mix_f32 v0, v1, v2, -v3 +// GFX9-FMAMIX: v_fma_mix_f32 v0, v1, v2, -v3 ; encoding: [0x00,0x00,0xa0,0xd3,0x01,0x05,0x0e,0x84] + +v_fma_mix_f32 v0, -abs(v1), v2, v3 +// GFX9-FMAMIX: v_fma_mix_f32 v0, -|v1|, v2, v3 ; encoding: [0x00,0x01,0xa0,0xd3,0x01,0x05,0x0e,0x24] + +v_fma_mix_f32 v0, v1, -abs(v2), v3 +// GFX9-FMAMIX: v_fma_mix_f32 v0, v1, -|v2|, v3 ; encoding: [0x00,0x02,0xa0,0xd3,0x01,0x05,0x0e,0x44] + +v_fma_mix_f32 v0, v1, v2, -abs(v3) +// GFX9-FMAMIX: v_fma_mix_f32 v0, v1, v2, -|v3| ; encoding: [0x00,0x04,0xa0,0xd3,0x01,0x05,0x0e,0x84] + +v_fma_mixlo_f16 v0, abs(v1), -v2, abs(v3) +// GFX9-FMAMIX: v_fma_mixlo_f16 v0, |v1|, -v2, |v3| ; encoding: [0x00,0x05,0xa1,0xd3,0x01,0x05,0x0e,0x44] + +v_fma_mixhi_f16 v0, -v1, abs(v2), -abs(v3) +// GFX9-FMAMIX: v_fma_mixhi_f16 v0, -v1, |v2|, -|v3| ; encoding: [0x00,0x06,0xa2,0xd3,0x01,0x05,0x0e,0xa4] + +v_fma_mixlo_f16 v0, v1, v2, v3 clamp +// GFX9-FMAMIX: v_fma_mixlo_f16 v0, v1, v2, v3 clamp ; encoding: [0x00,0x80,0xa1,0xd3,0x01,0x05,0x0e,0x04] + +v_fma_mixhi_f16 v0, v1, v2, v3 clamp +// GFX9-FMAMIX: v_fma_mixhi_f16 v0, v1, v2, v3 clamp ; encoding: [0x00,0x80,0xa2,0xd3,0x01,0x05,0x0e,0x04] + +// +// op_sel with non-packed instructions +// + +v_fma_mix_f32 v0, v1, v2, v3 op_sel:[0,0,0] +// GFX9-FMAMIX: v_fma_mix_f32 v0, v1, v2, v3 ; encoding: [0x00,0x00,0xa0,0xd3,0x01,0x05,0x0e,0x04] +// FIXME: Better error +// GFX-MADMIX-ERR: error: unknown token in expression + +v_fma_mix_f32 v0, v1, v2, v3 op_sel:[1,0,0] +// GFX9-FMAMIX: v_fma_mix_f32 v0, v1, v2, v3 op_sel:[1,0,0] ; encoding: [0x00,0x08,0xa0,0xd3,0x01,0x05,0x0e,0x04] + +v_fma_mix_f32 v0, v1, v2, v3 op_sel:[0,1,0] +// GFX9-FMAMIX: v_fma_mix_f32 v0, v1, v2, v3 op_sel:[0,1,0] ; encoding: [0x00,0x10,0xa0,0xd3,0x01,0x05,0x0e,0x04] + +v_fma_mix_f32 v0, v1, v2, v3 op_sel:[0,0,1] +// GFX9-FMAMIX: v_fma_mix_f32 v0, v1, v2, v3 op_sel:[0,0,1] ; encoding: [0x00,0x20,0xa0,0xd3,0x01,0x05,0x0e,0x04] + +v_fma_mix_f32 v0, v1, v2, v3 op_sel:[1,1,1] +// GFX9-FMAMIX: v_fma_mix_f32 v0, v1, v2, v3 op_sel:[1,1,1] ; encoding: [0x00,0x38,0xa0,0xd3,0x01,0x05,0x0e,0x04] + +v_fma_mix_f32 v0, v1, v2, v3 +// GFX9-FMAMIX: v_fma_mix_f32 v0, v1, v2, v3 ; encoding: [0x00,0x00,0xa0,0xd3,0x01,0x05,0x0e,0x04] + +v_fma_mix_f32 v0, v1, v2, v3 op_sel_hi:[1,0,0] +// GFX9-FMAMIX: v_fma_mix_f32 v0, v1, v2, v3 op_sel_hi:[1,0,0] ; encoding: [0x00,0x00,0xa0,0xd3,0x01,0x05,0x0e,0x0c] + +v_fma_mix_f32 v0, v1, v2, v3 op_sel_hi:[0,1,0] +// GFX9-FMAMIX: v_fma_mix_f32 v0, v1, v2, v3 op_sel_hi:[0,1,0] ; encoding: [0x00,0x00,0xa0,0xd3,0x01,0x05,0x0e,0x14] + +v_fma_mix_f32 v0, v1, v2, v3 op_sel_hi:[0,0,1] +// GFX9-FMAMIX: v_fma_mix_f32 v0, v1, v2, v3 op_sel_hi:[0,0,1] ; encoding: [0x00,0x40,0xa0,0xd3,0x01,0x05,0x0e,0x04] + +v_fma_mix_f32 v0, v1, v2, v3 op_sel_hi:[1,1,1] +// GFX9-FMAMIX: v_fma_mix_f32 v0, v1, v2, v3 op_sel_hi:[1,1,1] ; encoding: [0x00,0x40,0xa0,0xd3,0x01,0x05,0x0e,0x1c] + +v_fma_mixlo_f16 v0, v1, v2, v3 op_sel_hi:[1,0,1] clamp +// GFX9-FMAMIX: v_fma_mixlo_f16 v0, v1, v2, v3 op_sel_hi:[1,0,1] clamp ; encoding: [0x00,0xc0,0xa1,0xd3,0x01,0x05,0x0e,0x0c] + +v_fma_mixhi_f16 v0, v1, v2, v3 op_sel_hi:[1,0,1] clamp +// GFX9-FMAMIX: v_fma_mixhi_f16 v0, v1, v2, v3 op_sel_hi:[1,0,1] clamp ; encoding: [0x00,0xc0,0xa2,0xd3,0x01,0x05,0x0e,0x0c] diff --git a/llvm/test/MC/AMDGPU/mad-mix.s b/llvm/test/MC/AMDGPU/mad-mix.s new file mode 100644 index 00000000000..9d696c7250e --- /dev/null +++ b/llvm/test/MC/AMDGPU/mad-mix.s @@ -0,0 +1,102 @@ +// RUN: llvm-mc -arch=amdgcn -mcpu=gfx900 -show-encoding %s | FileCheck -check-prefix=GFX9-MADMIX %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx906 -show-encoding %s 2>&1 | FileCheck -check-prefix=GFX9-FMAMIX-ERR %s + +v_mad_mix_f32 v0, v1, v2, v3 +// GFX9-MADMIX: v_mad_mix_f32 v0, v1, v2, v3 ; encoding: [0x00,0x00,0xa0,0xd3,0x01,0x05,0x0e,0x04] +// GFX9-FMAMIX-ERR: error: instruction not supported on this GPU + +v_mad_mixlo_f16 v0, v1, v2, v3 +// GFX9-MADMIX: v_mad_mixlo_f16 v0, v1, v2, v3 ; encoding: [0x00,0x00,0xa1,0xd3,0x01,0x05,0x0e,0x04] +// GFX9-FMAMIX-ERR: error: instruction not supported on this GPU + +v_mad_mixhi_f16 v0, v1, v2, v3 +// GFX9-MADMIX: v_mad_mixhi_f16 v0, v1, v2, v3 ; encoding: [0x00,0x00,0xa2,0xd3,0x01,0x05,0x0e,0x04] +// GFX9-FMAMIX-ERR: error: instruction not supported on this GPU + +// +// Regular source modifiers on non-packed instructions +// + +v_mad_mix_f32 v0, abs(v1), v2, v3 +// GFX9-MADMIX: v_mad_mix_f32 v0, |v1|, v2, v3 ; encoding: [0x00,0x01,0xa0,0xd3,0x01,0x05,0x0e,0x04] + +// FIXME: Better error +// GFX9-FMAMIX-ERR: error: invalid operand for instruction + +v_mad_mix_f32 v0, v1, abs(v2), v3 +// GFX9-MADMIX: v_mad_mix_f32 v0, v1, |v2|, v3 ; encoding: [0x00,0x02,0xa0,0xd3,0x01,0x05,0x0e,0x04] + +v_mad_mix_f32 v0, v1, v2, abs(v3) +// GFX9-MADMIX: v_mad_mix_f32 v0, v1, v2, |v3| ; encoding: [0x00,0x04,0xa0,0xd3,0x01,0x05,0x0e,0x04] + +v_mad_mix_f32 v0, -v1, v2, v3 +// GFX9-MADMIX: v_mad_mix_f32 v0, -v1, v2, v3 ; encoding: [0x00,0x00,0xa0,0xd3,0x01,0x05,0x0e,0x24] + +v_mad_mix_f32 v0, v1, -v2, v3 +// GFX9-MADMIX: v_mad_mix_f32 v0, v1, -v2, v3 ; encoding: [0x00,0x00,0xa0,0xd3,0x01,0x05,0x0e,0x44] + +v_mad_mix_f32 v0, v1, v2, -v3 +// GFX9-MADMIX: v_mad_mix_f32 v0, v1, v2, -v3 ; encoding: [0x00,0x00,0xa0,0xd3,0x01,0x05,0x0e,0x84] + +v_mad_mix_f32 v0, -abs(v1), v2, v3 +// GFX9-MADMIX: v_mad_mix_f32 v0, -|v1|, v2, v3 ; encoding: [0x00,0x01,0xa0,0xd3,0x01,0x05,0x0e,0x24] + +v_mad_mix_f32 v0, v1, -abs(v2), v3 +// GFX9-MADMIX: v_mad_mix_f32 v0, v1, -|v2|, v3 ; encoding: [0x00,0x02,0xa0,0xd3,0x01,0x05,0x0e,0x44] + +v_mad_mix_f32 v0, v1, v2, -abs(v3) +// GFX9-MADMIX: v_mad_mix_f32 v0, v1, v2, -|v3| ; encoding: [0x00,0x04,0xa0,0xd3,0x01,0x05,0x0e,0x84] + +v_mad_mixlo_f16 v0, abs(v1), -v2, abs(v3) +// GFX9-MADMIX: v_mad_mixlo_f16 v0, |v1|, -v2, |v3| ; encoding: [0x00,0x05,0xa1,0xd3,0x01,0x05,0x0e,0x44] + +v_mad_mixhi_f16 v0, -v1, abs(v2), -abs(v3) +// GFX9-MADMIX: v_mad_mixhi_f16 v0, -v1, |v2|, -|v3| ; encoding: [0x00,0x06,0xa2,0xd3,0x01,0x05,0x0e,0xa4] + +v_mad_mixlo_f16 v0, v1, v2, v3 clamp +// GFX9-MADMIX: v_mad_mixlo_f16 v0, v1, v2, v3 clamp ; encoding: [0x00,0x80,0xa1,0xd3,0x01,0x05,0x0e,0x04] + +v_mad_mixhi_f16 v0, v1, v2, v3 clamp +// GFX9-MADMIX: v_mad_mixhi_f16 v0, v1, v2, v3 clamp ; encoding: [0x00,0x80,0xa2,0xd3,0x01,0x05,0x0e,0x04] + +// +// op_sel with non-packed instructions +// + +v_mad_mix_f32 v0, v1, v2, v3 op_sel:[0,0,0] +// GFX9-MADMIX: v_mad_mix_f32 v0, v1, v2, v3 ; encoding: [0x00,0x00,0xa0,0xd3,0x01,0x05,0x0e,0x04] +// FIXME: Better error +// GFX-FMAMIX-ERR: error: unknown token in expression + +v_mad_mix_f32 v0, v1, v2, v3 op_sel:[1,0,0] +// GFX9-MADMIX: v_mad_mix_f32 v0, v1, v2, v3 op_sel:[1,0,0] ; encoding: [0x00,0x08,0xa0,0xd3,0x01,0x05,0x0e,0x04] + +v_mad_mix_f32 v0, v1, v2, v3 op_sel:[0,1,0] +// GFX9-MADMIX: v_mad_mix_f32 v0, v1, v2, v3 op_sel:[0,1,0] ; encoding: [0x00,0x10,0xa0,0xd3,0x01,0x05,0x0e,0x04] + +v_mad_mix_f32 v0, v1, v2, v3 op_sel:[0,0,1] +// GFX9-MADMIX: v_mad_mix_f32 v0, v1, v2, v3 op_sel:[0,0,1] ; encoding: [0x00,0x20,0xa0,0xd3,0x01,0x05,0x0e,0x04] + +v_mad_mix_f32 v0, v1, v2, v3 op_sel:[1,1,1] +// GFX9-MADMIX: v_mad_mix_f32 v0, v1, v2, v3 op_sel:[1,1,1] ; encoding: [0x00,0x38,0xa0,0xd3,0x01,0x05,0x0e,0x04] + +v_mad_mix_f32 v0, v1, v2, v3 +// GFX9-MADMIX: v_mad_mix_f32 v0, v1, v2, v3 ; encoding: [0x00,0x00,0xa0,0xd3,0x01,0x05,0x0e,0x04] + +v_mad_mix_f32 v0, v1, v2, v3 op_sel_hi:[1,0,0] +// GFX9-MADMIX: v_mad_mix_f32 v0, v1, v2, v3 op_sel_hi:[1,0,0] ; encoding: [0x00,0x00,0xa0,0xd3,0x01,0x05,0x0e,0x0c] + +v_mad_mix_f32 v0, v1, v2, v3 op_sel_hi:[0,1,0] +// GFX9-MADMIX: v_mad_mix_f32 v0, v1, v2, v3 op_sel_hi:[0,1,0] ; encoding: [0x00,0x00,0xa0,0xd3,0x01,0x05,0x0e,0x14] + +v_mad_mix_f32 v0, v1, v2, v3 op_sel_hi:[0,0,1] +// GFX9-MADMIX: v_mad_mix_f32 v0, v1, v2, v3 op_sel_hi:[0,0,1] ; encoding: [0x00,0x40,0xa0,0xd3,0x01,0x05,0x0e,0x04] + +v_mad_mix_f32 v0, v1, v2, v3 op_sel_hi:[1,1,1] +// GFX9-MADMIX: v_mad_mix_f32 v0, v1, v2, v3 op_sel_hi:[1,1,1] ; encoding: [0x00,0x40,0xa0,0xd3,0x01,0x05,0x0e,0x1c] + +v_mad_mixlo_f16 v0, v1, v2, v3 op_sel_hi:[1,0,1] clamp +// GFX9-MADMIX: v_mad_mixlo_f16 v0, v1, v2, v3 op_sel_hi:[1,0,1] clamp ; encoding: [0x00,0xc0,0xa1,0xd3,0x01,0x05,0x0e,0x0c] + +v_mad_mixhi_f16 v0, v1, v2, v3 op_sel_hi:[1,0,1] clamp +// GFX9-MADMIX: v_mad_mixhi_f16 v0, v1, v2, v3 op_sel_hi:[1,0,1] clamp ; encoding: [0x00,0xc0,0xa2,0xd3,0x01,0x05,0x0e,0x0c] diff --git a/llvm/test/MC/AMDGPU/vop3p.s b/llvm/test/MC/AMDGPU/vop3p.s index a716ed44057..1a0741247cc 100644 --- a/llvm/test/MC/AMDGPU/vop3p.s +++ b/llvm/test/MC/AMDGPU/vop3p.s @@ -1,5 +1,4 @@ // RUN: llvm-mc -arch=amdgcn -mcpu=gfx900 -show-encoding %s | FileCheck -check-prefix=GFX9 %s - // // Test op_sel/op_sel_hi // @@ -167,95 +166,3 @@ v_pk_min_f16 v0, v1, v2 v_pk_max_f16 v0, v1, v2 // GFX9: v_pk_max_f16 v0, v1, v2 ; encoding: [0x00,0x00,0x92,0xd3,0x01,0x05,0x02,0x18] - -v_mad_mix_f32 v0, v1, v2, v3 -// GFX9: v_mad_mix_f32 v0, v1, v2, v3 ; encoding: [0x00,0x00,0xa0,0xd3,0x01,0x05,0x0e,0x04] - -v_mad_mixlo_f16 v0, v1, v2, v3 -// GFX9: v_mad_mixlo_f16 v0, v1, v2, v3 ; encoding: [0x00,0x00,0xa1,0xd3,0x01,0x05,0x0e,0x04] - -v_mad_mixhi_f16 v0, v1, v2, v3 -// GFX9: v_mad_mixhi_f16 v0, v1, v2, v3 ; encoding: [0x00,0x00,0xa2,0xd3,0x01,0x05,0x0e,0x04] - -// -// Regular source modifiers on non-packed instructions -// - -v_mad_mix_f32 v0, abs(v1), v2, v3 -// GFX9: v_mad_mix_f32 v0, |v1|, v2, v3 ; encoding: [0x00,0x01,0xa0,0xd3,0x01,0x05,0x0e,0x04] - -v_mad_mix_f32 v0, v1, abs(v2), v3 -// GFX9: v_mad_mix_f32 v0, v1, |v2|, v3 ; encoding: [0x00,0x02,0xa0,0xd3,0x01,0x05,0x0e,0x04] - -v_mad_mix_f32 v0, v1, v2, abs(v3) -// GFX9: v_mad_mix_f32 v0, v1, v2, |v3| ; encoding: [0x00,0x04,0xa0,0xd3,0x01,0x05,0x0e,0x04] - -v_mad_mix_f32 v0, -v1, v2, v3 -// GFX9: v_mad_mix_f32 v0, -v1, v2, v3 ; encoding: [0x00,0x00,0xa0,0xd3,0x01,0x05,0x0e,0x24] - -v_mad_mix_f32 v0, v1, -v2, v3 -// GFX9: v_mad_mix_f32 v0, v1, -v2, v3 ; encoding: [0x00,0x00,0xa0,0xd3,0x01,0x05,0x0e,0x44] - -v_mad_mix_f32 v0, v1, v2, -v3 -// GFX9: v_mad_mix_f32 v0, v1, v2, -v3 ; encoding: [0x00,0x00,0xa0,0xd3,0x01,0x05,0x0e,0x84] - -v_mad_mix_f32 v0, -abs(v1), v2, v3 -// GFX9: v_mad_mix_f32 v0, -|v1|, v2, v3 ; encoding: [0x00,0x01,0xa0,0xd3,0x01,0x05,0x0e,0x24] - -v_mad_mix_f32 v0, v1, -abs(v2), v3 -// GFX9: v_mad_mix_f32 v0, v1, -|v2|, v3 ; encoding: [0x00,0x02,0xa0,0xd3,0x01,0x05,0x0e,0x44] - -v_mad_mix_f32 v0, v1, v2, -abs(v3) -// GFX9: v_mad_mix_f32 v0, v1, v2, -|v3| ; encoding: [0x00,0x04,0xa0,0xd3,0x01,0x05,0x0e,0x84] - -v_mad_mixlo_f16 v0, abs(v1), -v2, abs(v3) -// GFX9: v_mad_mixlo_f16 v0, |v1|, -v2, |v3| ; encoding: [0x00,0x05,0xa1,0xd3,0x01,0x05,0x0e,0x44] - -v_mad_mixhi_f16 v0, -v1, abs(v2), -abs(v3) -// GFX9: v_mad_mixhi_f16 v0, -v1, |v2|, -|v3| ; encoding: [0x00,0x06,0xa2,0xd3,0x01,0x05,0x0e,0xa4] - -v_mad_mixlo_f16 v0, v1, v2, v3 clamp -// GFX9: v_mad_mixlo_f16 v0, v1, v2, v3 clamp ; encoding: [0x00,0x80,0xa1,0xd3,0x01,0x05,0x0e,0x04] - -v_mad_mixhi_f16 v0, v1, v2, v3 clamp -// GFX9: v_mad_mixhi_f16 v0, v1, v2, v3 clamp ; encoding: [0x00,0x80,0xa2,0xd3,0x01,0x05,0x0e,0x04] - -// -// op_sel with non-packed instructions -// - -v_mad_mix_f32 v0, v1, v2, v3 op_sel:[0,0,0] -// GFX9: v_mad_mix_f32 v0, v1, v2, v3 ; encoding: [0x00,0x00,0xa0,0xd3,0x01,0x05,0x0e,0x04] - -v_mad_mix_f32 v0, v1, v2, v3 op_sel:[1,0,0] -// GFX9: v_mad_mix_f32 v0, v1, v2, v3 op_sel:[1,0,0] ; encoding: [0x00,0x08,0xa0,0xd3,0x01,0x05,0x0e,0x04] - -v_mad_mix_f32 v0, v1, v2, v3 op_sel:[0,1,0] -// GFX9: v_mad_mix_f32 v0, v1, v2, v3 op_sel:[0,1,0] ; encoding: [0x00,0x10,0xa0,0xd3,0x01,0x05,0x0e,0x04] - -v_mad_mix_f32 v0, v1, v2, v3 op_sel:[0,0,1] -// GFX9: v_mad_mix_f32 v0, v1, v2, v3 op_sel:[0,0,1] ; encoding: [0x00,0x20,0xa0,0xd3,0x01,0x05,0x0e,0x04] - -v_mad_mix_f32 v0, v1, v2, v3 op_sel:[1,1,1] -// GFX9: v_mad_mix_f32 v0, v1, v2, v3 op_sel:[1,1,1] ; encoding: [0x00,0x38,0xa0,0xd3,0x01,0x05,0x0e,0x04] - -v_mad_mix_f32 v0, v1, v2, v3 op_sel_hi:[0,0,0] -// GFX9: v_mad_mix_f32 v0, v1, v2, v3 ; encoding: [0x00,0x00,0xa0,0xd3,0x01,0x05,0x0e,0x04] - -v_mad_mix_f32 v0, v1, v2, v3 op_sel_hi:[1,0,0] -// GFX9: v_mad_mix_f32 v0, v1, v2, v3 op_sel_hi:[1,0,0] ; encoding: [0x00,0x00,0xa0,0xd3,0x01,0x05,0x0e,0x0c] - -v_mad_mix_f32 v0, v1, v2, v3 op_sel_hi:[0,1,0] -// GFX9: v_mad_mix_f32 v0, v1, v2, v3 op_sel_hi:[0,1,0] ; encoding: [0x00,0x00,0xa0,0xd3,0x01,0x05,0x0e,0x14] - -v_mad_mix_f32 v0, v1, v2, v3 op_sel_hi:[0,0,1] -// GFX9: v_mad_mix_f32 v0, v1, v2, v3 op_sel_hi:[0,0,1] ; encoding: [0x00,0x40,0xa0,0xd3,0x01,0x05,0x0e,0x04] - -v_mad_mix_f32 v0, v1, v2, v3 op_sel_hi:[1,1,1] -// GFX9: v_mad_mix_f32 v0, v1, v2, v3 op_sel_hi:[1,1,1] ; encoding: [0x00,0x40,0xa0,0xd3,0x01,0x05,0x0e,0x1c] - -v_mad_mixlo_f16 v0, v1, v2, v3 op_sel_hi:[1,0,1] clamp -// GFX9: v_mad_mixlo_f16 v0, v1, v2, v3 op_sel_hi:[1,0,1] clamp ; encoding: [0x00,0xc0,0xa1,0xd3,0x01,0x05,0x0e,0x0c] - -v_mad_mixhi_f16 v0, v1, v2, v3 op_sel_hi:[1,0,1] clamp -// GFX9: v_mad_mixhi_f16 v0, v1, v2, v3 op_sel_hi:[1,0,1] clamp ; encoding: [0x00,0xc0,0xa2,0xd3,0x01,0x05,0x0e,0x0c] diff --git a/llvm/test/MC/Disassembler/AMDGPU/mad_mix.txt b/llvm/test/MC/Disassembler/AMDGPU/mad_mix.txt new file mode 100644 index 00000000000..b328400b52c --- /dev/null +++ b/llvm/test/MC/Disassembler/AMDGPU/mad_mix.txt @@ -0,0 +1,6 @@ +# RUN: llvm-mc -arch=amdgcn -mcpu=gfx900 -disassemble -show-encoding < %s | FileCheck %s -check-prefix=GFX900 +# RUN: llvm-mc -arch=amdgcn -mcpu=gfx906 -disassemble -show-encoding < %s | FileCheck %s -check-prefix=GFX906 + +# GFX900: v_mad_mix_f32 v0, v1, v2, v3 ; encoding: [0x00,0x00,0xa0,0xd3,0x01,0x05,0x0e,0x04] +# GFX906: v_fma_mix_f32 v0, v1, v2, v3 ; encoding: [0x00,0x00,0xa0,0xd3,0x01,0x05,0x0e,0x04] +0x00,0x00,0xa0,0xd3,0x01,0x05,0x0e,0x04 diff --git a/llvm/test/Object/AMDGPU/elf-header-flags-mach.yaml b/llvm/test/Object/AMDGPU/elf-header-flags-mach.yaml index 585364b1cc2..c3800d2ff27 100644 --- a/llvm/test/Object/AMDGPU/elf-header-flags-mach.yaml +++ b/llvm/test/Object/AMDGPU/elf-header-flags-mach.yaml @@ -85,6 +85,13 @@ # RUN: yaml2obj -docnum=29 %s > %t.o.29 # RUN: llvm-readobj -s -file-headers %t.o.29 | FileCheck --check-prefixes=ELF-ALL,ELF-GFX902 %s # RUN: obj2yaml %t.o.29 | FileCheck --check-prefixes=YAML-GFX902 %s +# RUN: yaml2obj -docnum=30 %s > %t.o.30 +# RUN: llvm-readobj -s -file-headers %t.o.30 | FileCheck --check-prefixes=ELF-ALL,ELF-GFX904 %s +# RUN: obj2yaml %t.o.30 | FileCheck --check-prefixes=YAML-GFX904 %s +# RUN: yaml2obj -docnum=31 %s > %t.o.31 +# RUN: llvm-readobj -s -file-headers %t.o.31 | FileCheck --check-prefixes=ELF-ALL,ELF-GFX906 %s +# RUN: obj2yaml %t.o.31 | FileCheck --check-prefixes=YAML-GFX906 %s + # ELF-ALL: Flags [ # ELF-R600: EF_AMDGPU_MACH_R600_R600 (0x1) @@ -116,6 +123,8 @@ # ELF-GFX810: EF_AMDGPU_MACH_AMDGCN_GFX810 (0x2B) # ELF-GFX900: EF_AMDGPU_MACH_AMDGCN_GFX900 (0x2C) # ELF-GFX902: EF_AMDGPU_MACH_AMDGCN_GFX902 (0x2D) +# ELF-GFX904: EF_AMDGPU_MACH_AMDGCN_GFX904 (0x2E) +# ELF-GFX906: EF_AMDGPU_MACH_AMDGCN_GFX906 (0x2F) # ELF-ALL: ] # YAML-R600: Flags: [ EF_AMDGPU_MACH_R600_R600 ] @@ -147,6 +156,8 @@ # YAML-GFX810: Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX810 ] # YAML-GFX900: Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX900 ] # YAML-GFX902: Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX902 ] +# YAML-GFX904: Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX904 ] +# YAML-GFX906: Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX906 ] # Doc1 --- !ELF @@ -466,3 +477,25 @@ FileHeader: Machine: EM_AMDGPU Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX902 ] ... + +# Doc30 +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + OSABI: ELFOSABI_NONE + Type: ET_REL + Machine: EM_AMDGPU + Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX904 ] +... + +# Doc31 +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + OSABI: ELFOSABI_NONE + Type: ET_REL + Machine: EM_AMDGPU + Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX906 ] +... diff --git a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp index 0adacf9bf5b..aa4ce25daed 100644 --- a/llvm/tools/llvm-readobj/ELFDumper.cpp +++ b/llvm/tools/llvm-readobj/ELFDumper.cpp @@ -1292,6 +1292,8 @@ static const EnumEntry<unsigned> ElfHeaderAMDGPUFlags[] = { LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX810), LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX900), LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX902), + LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX904), + LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX906), LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_XNACK) }; |