diff options
author | Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com> | 2019-07-11 00:10:17 +0000 |
---|---|---|
committer | Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com> | 2019-07-11 00:10:17 +0000 |
commit | e93279fd1b00f24c9cc1646faabf853cfa232a8a (patch) | |
tree | 8c1eb148c6a5f803196a91ab55027eaeaf748388 | |
parent | 31c4d2a40d12efb6be340b93a84fb582f7897f5f (diff) | |
download | bcm5719-llvm-e93279fd1b00f24c9cc1646faabf853cfa232a8a.tar.gz bcm5719-llvm-e93279fd1b00f24c9cc1646faabf853cfa232a8a.zip |
[AMDGPU] gfx908 atomic fadd and atomic pk_fadd
Differential Revision: https://reviews.llvm.org/D64435
llvm-svn: 365717
-rw-r--r-- | llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 23 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 4 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 4 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUInstructions.td | 10 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/BUFInstructions.td | 58 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/FLATInstructions.td | 28 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 70 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstrInfo.td | 25 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.ll | 72 | ||||
-rw-r--r-- | llvm/test/MC/AMDGPU/atomic-fadd-insts.s | 110 | ||||
-rw-r--r-- | llvm/test/MC/Disassembler/AMDGPU/atomic-fadd-insts.txt | 103 |
11 files changed, 503 insertions, 4 deletions
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index f546f535b41..5c90607d2f4 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -1642,6 +1642,29 @@ def int_amdgcn_udot8 : //===----------------------------------------------------------------------===// // MI-100 intrinsics // ===----------------------------------------------------------------------===// + +class AMDGPUBufferAtomicNoRtn : Intrinsic < + [], + [llvm_anyfloat_ty, // vdata(VGPR) + llvm_v4i32_ty, // rsrc(SGPR) + llvm_i32_ty, // vindex(VGPR) + llvm_i32_ty, // offset(SGPR/VGPR/imm) + llvm_i1_ty], // slc(imm) + [], "", [SDNPMemOperand]>, + AMDGPURsrcIntrinsic<1, 0>; + +class AMDGPUGlobalAtomicNoRtn : Intrinsic < + [], + [llvm_anyptr_ty, // vaddr + llvm_anyfloat_ty], // vdata(VGPR) + [IntrArgMemOnly, NoCapture<0>], "", [SDNPMemOperand]>; + +def int_amdgcn_buffer_atomic_fadd : AMDGPUBufferAtomicNoRtn; +def int_amdgcn_global_atomic_fadd : AMDGPUGlobalAtomicNoRtn; + +//===----------------------------------------------------------------------===// +// MI-100 intrinsics +// ===----------------------------------------------------------------------===// // llvm.amdgcn.mfma.f32.* vdst, srcA, srcB, srcC, cbsz, abid, blgp def int_amdgcn_mfma_f32_32x32x1f32 : Intrinsic<[llvm_v32i32_ty], [llvm_float_ty, llvm_float_ty, llvm_v32i32_ty, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 0ccd58d44aa..779166bd059 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4428,6 +4428,10 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(BUFFER_ATOMIC_OR) NODE_NAME_CASE(BUFFER_ATOMIC_XOR) NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP) + NODE_NAME_CASE(BUFFER_ATOMIC_FADD) + NODE_NAME_CASE(BUFFER_ATOMIC_PK_FADD) + NODE_NAME_CASE(ATOMIC_FADD) + NODE_NAME_CASE(ATOMIC_PK_FADD) case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index 40ff24f0754..fe7ad694943 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -533,6 +533,10 @@ enum NodeType : unsigned { BUFFER_ATOMIC_OR, BUFFER_ATOMIC_XOR, BUFFER_ATOMIC_CMPSWAP, + BUFFER_ATOMIC_FADD, + BUFFER_ATOMIC_PK_FADD, + ATOMIC_FADD, + ATOMIC_PK_FADD, LAST_AMDGPU_ISD_NUMBER }; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index d5215ab9601..b29b0e7b17e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -546,11 +546,13 @@ class AtomicCmpSwapRegion <SDNode cmp_swap_node> : PatFrag< def atomic_cmp_swap_local : AtomicCmpSwapLocal <atomic_cmp_swap>; +class global_binary_atomic_op_frag<SDNode atomic_op> : PatFrag< + (ops node:$ptr, node:$value), + (atomic_op node:$ptr, node:$value), + [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}]>; + multiclass global_binary_atomic_op<SDNode atomic_op> { - def "" : PatFrag< - (ops node:$ptr, node:$value), - (atomic_op node:$ptr, node:$value), - [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}]>; + def "" : global_binary_atomic_op_frag<atomic_op>; def _noret : PatFrag< (ops node:$ptr, node:$value), diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index 02b054f089f..bc70d138e42 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -1021,6 +1021,17 @@ defm BUFFER_STORE_FORMAT_D16_HI_X : MUBUF_Pseudo_Stores < def BUFFER_WBINVL1 : MUBUF_Invalidate <"buffer_wbinvl1", int_amdgcn_buffer_wbinvl1>; +let SubtargetPredicate = HasAtomicFaddInsts in { + +defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_NO_RTN < + "buffer_atomic_add_f32", VGPR_32, f32, atomic_add_global +>; +defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_NO_RTN < + "buffer_atomic_pk_add_f16", VGPR_32, v2f16, atomic_add_global +>; + +} // End SubtargetPredicate = HasAtomicFaddInsts + //===----------------------------------------------------------------------===// // MTBUF Instructions //===----------------------------------------------------------------------===// @@ -1297,6 +1308,46 @@ defm : BufferAtomicPatterns<SIbuffer_atomic_and, i64, "BUFFER_ATOMIC_AND_X2">; defm : BufferAtomicPatterns<SIbuffer_atomic_or, i64, "BUFFER_ATOMIC_OR_X2">; defm : BufferAtomicPatterns<SIbuffer_atomic_xor, i64, "BUFFER_ATOMIC_XOR_X2">; +multiclass BufferAtomicPatterns_NO_RTN<SDPatternOperator name, ValueType vt, + string opcode> { + def : GCNPat< + (name vt:$vdata_in, v4i32:$rsrc, 0, + 0, i32:$soffset, imm:$offset, + imm:$cachepolicy, 0), + (!cast<MUBUF_Pseudo>(opcode # _OFFSET) $vdata_in, $rsrc, $soffset, + (as_i16imm $offset), (extract_slc $cachepolicy)) + >; + + def : GCNPat< + (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex, + 0, i32:$soffset, imm:$offset, + imm:$cachepolicy, imm), + (!cast<MUBUF_Pseudo>(opcode # _IDXEN) $vdata_in, $vindex, $rsrc, $soffset, + (as_i16imm $offset), (extract_slc $cachepolicy)) + >; + + def : GCNPat< + (name vt:$vdata_in, v4i32:$rsrc, 0, + i32:$voffset, i32:$soffset, imm:$offset, + imm:$cachepolicy, 0), + (!cast<MUBUF_Pseudo>(opcode # _OFFEN) $vdata_in, $voffset, $rsrc, $soffset, + (as_i16imm $offset), (extract_slc $cachepolicy)) + >; + + def : GCNPat< + (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex, + i32:$voffset, i32:$soffset, imm:$offset, + imm:$cachepolicy, imm), + (!cast<MUBUF_Pseudo>(opcode # _BOTHEN) + $vdata_in, + (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), + $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy)) + >; +} + +defm : BufferAtomicPatterns_NO_RTN<SIbuffer_atomic_fadd, f32, "BUFFER_ATOMIC_ADD_F32">; +defm : BufferAtomicPatterns_NO_RTN<SIbuffer_atomic_pk_fadd, v2f16, "BUFFER_ATOMIC_PK_ADD_F16">; + def : GCNPat< (SIbuffer_atomic_cmpswap i32:$data, i32:$cmp, v4i32:$rsrc, 0, @@ -2176,6 +2227,13 @@ def BUFFER_STORE_LDS_DWORD_vi : MUBUF_Real_vi <0x3d, BUFFER_STORE_LDS_DWORD>; def BUFFER_WBINVL1_vi : MUBUF_Real_vi <0x3e, BUFFER_WBINVL1>; def BUFFER_WBINVL1_VOL_vi : MUBUF_Real_vi <0x3f, BUFFER_WBINVL1_VOL>; +let SubtargetPredicate = HasAtomicFaddInsts in { + +defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Real_AllAddr_vi <0x4d>; +defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Real_AllAddr_vi <0x4e>; + +} // End SubtargetPredicate = HasAtomicFaddInsts + class MTBUF_Real_vi <bits<4> op, MTBUF_Pseudo ps> : MTBUF_Real<ps>, Enc64, diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 470458338c4..df334790b85 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -683,6 +683,16 @@ let SubtargetPredicate = isGFX10Plus, is_flat_global = 1 in { FLAT_Global_Atomic_Pseudo<"global_atomic_fmax_x2", VReg_64, f64>; } // End SubtargetPredicate = isGFX10Plus, is_flat_global = 1 +let SubtargetPredicate = HasAtomicFaddInsts, is_flat_global = 1 in { + +defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_NO_RTN < + "global_atomic_add_f32", VGPR_32, f32, atomic_add_global +>; +defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_NO_RTN < + "global_atomic_pk_add_f16", VGPR_32, v2f16, atomic_add_global +>; + +} // End SubtargetPredicate = HasAtomicFaddInsts //===----------------------------------------------------------------------===// // Flat Patterns @@ -744,6 +754,11 @@ class FlatAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt, (inst $vaddr, $data, $offset, $slc) >; +class FlatAtomicPatNoRtn <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < + (node (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc), vt:$data), + (inst $vaddr, $data, $offset, $slc) +>; + class FlatSignedAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt, ValueType data_vt = vt> : GCNPat < (vt (node (FLATSignedAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$data)), @@ -829,6 +844,9 @@ def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2f16>; } // End OtherPredicates = [HasFlatAddressSpace] +def atomic_fadd_global : global_binary_atomic_op_frag<SIglobal_atomic_fadd>; +def atomic_pk_fadd_global : global_binary_atomic_op_frag<SIglobal_atomic_pk_fadd>; + let OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10 in { def : FlatLoadSignedPat <GLOBAL_LOAD_UBYTE, az_extloadi8_global, i32>; @@ -906,6 +924,9 @@ def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SWAP_X2_RTN, atomic_swap_global, i64>; def : FlatSignedAtomicPat <GLOBAL_ATOMIC_CMPSWAP_X2_RTN, AMDGPUatomic_cmp_swap_global, i64, v2i64>; def : FlatSignedAtomicPat <GLOBAL_ATOMIC_XOR_X2_RTN, atomic_xor_global, i64>; +def : FlatAtomicPatNoRtn <GLOBAL_ATOMIC_ADD_F32, atomic_fadd_global, f32>; +def : FlatAtomicPatNoRtn <GLOBAL_ATOMIC_PK_ADD_F16, atomic_pk_fadd_global, v2f16>; + } // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10 @@ -1326,3 +1347,10 @@ defm SCRATCH_LOAD_SBYTE_D16 : FLAT_Real_AllAddr_gfx10<0x022>; defm SCRATCH_LOAD_SBYTE_D16_HI : FLAT_Real_AllAddr_gfx10<0x023>; defm SCRATCH_LOAD_SHORT_D16 : FLAT_Real_AllAddr_gfx10<0x024>; defm SCRATCH_LOAD_SHORT_D16_HI : FLAT_Real_AllAddr_gfx10<0x025>; + +let SubtargetPredicate = HasAtomicFaddInsts in { + +defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Real_AllAddr_vi <0x04d>; +defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Real_AllAddr_vi <0x04e>; + +} // End SubtargetPredicate = HasAtomicFaddInsts diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 8568d015ab5..e8bc5c85f4e 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -950,6 +950,33 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, return true; } + case Intrinsic::amdgcn_buffer_atomic_fadd: { + SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + + Info.opc = ISD::INTRINSIC_VOID; + Info.memVT = MVT::getVT(CI.getOperand(0)->getType()); + Info.ptrVal = MFI->getBufferPSV( + *MF.getSubtarget<GCNSubtarget>().getInstrInfo(), + CI.getArgOperand(1)); + Info.align = 0; + Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; + + const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4)); + if (!Vol || !Vol->isZero()) + Info.flags |= MachineMemOperand::MOVolatile; + + return true; + } + case Intrinsic::amdgcn_global_atomic_fadd: { + Info.opc = ISD::INTRINSIC_VOID; + Info.memVT = MVT::getVT(CI.getOperand(0)->getType() + ->getPointerElementType()); + Info.ptrVal = CI.getOperand(0); + Info.align = 0; + Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; + + return true; + } case Intrinsic::amdgcn_ds_append: case Intrinsic::amdgcn_ds_consume: { Info.opc = ISD::INTRINSIC_W_CHAIN; @@ -6858,6 +6885,49 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, M->getMemoryVT(), M->getMemOperand()); } + case Intrinsic::amdgcn_buffer_atomic_fadd: { + unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue(); + unsigned IdxEn = 1; + if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4))) + IdxEn = Idx->getZExtValue() != 0; + SDValue Ops[] = { + Chain, + Op.getOperand(2), // vdata + Op.getOperand(3), // rsrc + Op.getOperand(4), // vindex + SDValue(), // voffset -- will be set by setBufferOffsets + SDValue(), // soffset -- will be set by setBufferOffsets + SDValue(), // offset -- will be set by setBufferOffsets + DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy + DAG.getConstant(IdxEn, DL, MVT::i1), // idxen + }; + setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]); + EVT VT = Op.getOperand(2).getValueType(); + + auto *M = cast<MemSDNode>(Op); + unsigned Opcode = VT.isVector() ? AMDGPUISD::BUFFER_ATOMIC_PK_FADD + : AMDGPUISD::BUFFER_ATOMIC_FADD; + + return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT, + M->getMemOperand()); + } + + case Intrinsic::amdgcn_global_atomic_fadd: { + SDValue Ops[] = { + Chain, + Op.getOperand(2), // ptr + Op.getOperand(3) // vdata + }; + EVT VT = Op.getOperand(3).getValueType(); + + auto *M = cast<MemSDNode>(Op); + unsigned Opcode = VT.isVector() ? AMDGPUISD::ATOMIC_PK_FADD + : AMDGPUISD::ATOMIC_FADD; + + return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT, + M->getMemOperand()); + } + case Intrinsic::amdgcn_end_cf: return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other, Op->getOperand(2), Chain), 0); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index c2874733708..5792efedb90 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -175,6 +175,19 @@ class SDBufferAtomic<string opcode> : SDNode <opcode, [SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore] >; +class SDBufferAtomicNoRtn<string opcode, ValueType ty> : SDNode <opcode, + SDTypeProfile<0, 8, + [SDTCisVT<0, ty>, // vdata + SDTCisVT<1, v4i32>, // rsrc + SDTCisVT<2, i32>, // vindex(VGPR) + SDTCisVT<3, i32>, // voffset(VGPR) + SDTCisVT<4, i32>, // soffset(SGPR) + SDTCisVT<5, i32>, // offset(imm) + SDTCisVT<6, i32>, // cachepolicy(imm) + SDTCisVT<7, i1>]>, // idxen(imm) + [SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore] +>; + def SIbuffer_atomic_swap : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_SWAP">; def SIbuffer_atomic_add : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_ADD">; def SIbuffer_atomic_sub : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_SUB">; @@ -185,6 +198,8 @@ def SIbuffer_atomic_umax : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_UMAX">; def SIbuffer_atomic_and : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_AND">; def SIbuffer_atomic_or : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_OR">; def SIbuffer_atomic_xor : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_XOR">; +def SIbuffer_atomic_fadd : SDBufferAtomicNoRtn <"AMDGPUISD::BUFFER_ATOMIC_FADD", f32>; +def SIbuffer_atomic_pk_fadd : SDBufferAtomicNoRtn <"AMDGPUISD::BUFFER_ATOMIC_PK_FADD", v2f16>; def SIbuffer_atomic_cmpswap : SDNode <"AMDGPUISD::BUFFER_ATOMIC_CMPSWAP", SDTypeProfile<1, 9, @@ -201,6 +216,16 @@ def SIbuffer_atomic_cmpswap : SDNode <"AMDGPUISD::BUFFER_ATOMIC_CMPSWAP", [SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore] >; +class SDGlobalAtomicNoRtn<string opcode, ValueType ty> : SDNode <opcode, + SDTypeProfile<0, 2, + [SDTCisPtrTy<0>, // vaddr + SDTCisVT<1, ty>]>, // vdata + [SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore] +>; + +def SIglobal_atomic_fadd : SDGlobalAtomicNoRtn <"AMDGPUISD::ATOMIC_FADD", f32>; +def SIglobal_atomic_pk_fadd : SDGlobalAtomicNoRtn <"AMDGPUISD::ATOMIC_PK_FADD", v2f16>; + def SIpc_add_rel_offset : SDNode<"AMDGPUISD::PC_ADD_REL_OFFSET", SDTypeProfile<1, 2, [SDTCisVT<0, iPTR>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]> >; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.ll new file mode 100644 index 00000000000..eb59c691ef6 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.ll @@ -0,0 +1,72 @@ +; RUN: llc < %s -march=amdgcn -mcpu=gfx908 -verify-machineinstrs | FileCheck %s -check-prefix=GCN + +declare void @llvm.amdgcn.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i1) +declare void @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i1) +declare void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)*, float) +declare void @llvm.amdgcn.global.atomic.fadd.p1v2f16.v2f16(<2 x half> addrspace(1)*, <2 x half>) + +; GCN-LABEL: {{^}}buffer_atomic_add_f32: +; GCN: buffer_atomic_add_f32 v0, v1, s[0:3], 0 idxen +define amdgpu_ps void @buffer_atomic_add_f32(<4 x i32> inreg %rsrc, float %data, i32 %vindex) { +main_body: + call void @llvm.amdgcn.buffer.atomic.fadd.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) + ret void +} + +; GCN-LABEL: {{^}}buffer_atomic_add_f32_off4_slc: +; GCN: buffer_atomic_add_f32 v0, v1, s[0:3], 0 idxen offset:4 slc +define amdgpu_ps void @buffer_atomic_add_f32_off4_slc(<4 x i32> inreg %rsrc, float %data, i32 %vindex) { +main_body: + call void @llvm.amdgcn.buffer.atomic.fadd.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1) + ret void +} + +; GCN-LABEL: {{^}}buffer_atomic_pk_add_v2f16: +; GCN: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 idxen +define amdgpu_ps void @buffer_atomic_pk_add_v2f16(<4 x i32> inreg %rsrc, <2 x half> %data, i32 %vindex) { +main_body: + call void @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) + ret void +} + +; GCN-LABEL: {{^}}buffer_atomic_pk_add_v2f16_off4_slc: +; GCN: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 idxen offset:4 slc +define amdgpu_ps void @buffer_atomic_pk_add_v2f16_off4_slc(<4 x i32> inreg %rsrc, <2 x half> %data, i32 %vindex) { +main_body: + call void @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1) + ret void +} + +; GCN-LABEL: {{^}}global_atomic_add_f32: +; GCN: global_atomic_add_f32 v[{{[0-9:]+}}], v{{[0-9]+}}, off +define amdgpu_kernel void @global_atomic_add_f32(float addrspace(1)* %ptr, float %data) { +main_body: + call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* %ptr, float %data) + ret void +} + +; GCN-LABEL: {{^}}global_atomic_add_f32_off4: +; GCN: global_atomic_add_f32 v[{{[0-9:]+}}], v{{[0-9]+}}, off offset:4 +define amdgpu_kernel void @global_atomic_add_f32_off4(float addrspace(1)* %ptr, float %data) { +main_body: + %p = getelementptr float, float addrspace(1)* %ptr, i64 1 + call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* %p, float %data) + ret void +} + +; GCN-LABEL: {{^}}global_atomic_pk_add_v2f16: +; GCN: global_atomic_pk_add_f16 v[{{[0-9:]+}}], v{{[0-9]+}}, off +define amdgpu_kernel void @global_atomic_pk_add_v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data) { +main_body: + call void @llvm.amdgcn.global.atomic.fadd.p1v2f16.v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data) + ret void +} + +; GCN-LABEL: {{^}}global_atomic_pk_add_v2f16_off4: +; GCN: global_atomic_pk_add_f16 v[{{[0-9:]+}}], v{{[0-9]+}}, off offset:4 +define amdgpu_kernel void @global_atomic_pk_add_v2f16_off4(<2 x half> addrspace(1)* %ptr, <2 x half> %data) { +main_body: + %p = getelementptr <2 x half>, <2 x half> addrspace(1)* %ptr, i64 1 + call void @llvm.amdgcn.global.atomic.fadd.p1v2f16.v2f16(<2 x half> addrspace(1)* %p, <2 x half> %data) + ret void +} diff --git a/llvm/test/MC/AMDGPU/atomic-fadd-insts.s b/llvm/test/MC/AMDGPU/atomic-fadd-insts.s new file mode 100644 index 00000000000..a0a516e4d77 --- /dev/null +++ b/llvm/test/MC/AMDGPU/atomic-fadd-insts.s @@ -0,0 +1,110 @@ +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx908 -show-encoding %s | FileCheck --check-prefix=GFX908 %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx908 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX908-ERR %s + +buffer_atomic_add_f32 v5, off, s[8:11], s3 offset:4095 +// GFX908: encoding: [0xff,0x0f,0x34,0xe1,0x00,0x05,0x02,0x03] + +buffer_atomic_add_f32 v255, off, s[8:11], s3 offset:4095 +// GFX908: encoding: [0xff,0x0f,0x34,0xe1,0x00,0xff,0x02,0x03] + +buffer_atomic_add_f32 v5, off, s[12:15], s3 offset:4095 +// GFX908: encoding: [0xff,0x0f,0x34,0xe1,0x00,0x05,0x03,0x03] + +buffer_atomic_add_f32 v5, off, s[96:99], s3 offset:4095 +// GFX908: encoding: [0xff,0x0f,0x34,0xe1,0x00,0x05,0x18,0x03] + +buffer_atomic_add_f32 v5, off, s[8:11], s101 offset:4095 +// GFX908: encoding: [0xff,0x0f,0x34,0xe1,0x00,0x05,0x02,0x65] + +buffer_atomic_add_f32 v5, off, s[8:11], m0 offset:4095 +// GFX908: encoding: [0xff,0x0f,0x34,0xe1,0x00,0x05,0x02,0x7c] + +buffer_atomic_add_f32 v5, off, s[8:11], 0 offset:4095 +// GFX908: encoding: [0xff,0x0f,0x34,0xe1,0x00,0x05,0x02,0x80] + +buffer_atomic_add_f32 v5, off, s[8:11], -1 offset:4095 +// GFX908: encoding: [0xff,0x0f,0x34,0xe1,0x00,0x05,0x02,0xc1] + +buffer_atomic_add_f32 v5, v0, s[8:11], s3 idxen offset:4095 +// GFX908: encoding: [0xff,0x2f,0x34,0xe1,0x00,0x05,0x02,0x03] + +buffer_atomic_add_f32 v5, v0, s[8:11], s3 offen offset:4095 +// GFX908: encoding: [0xff,0x1f,0x34,0xe1,0x00,0x05,0x02,0x03] + +buffer_atomic_add_f32 v5, off, s[8:11], s3 +// GFX908: encoding: [0x00,0x00,0x34,0xe1,0x00,0x05,0x02,0x03] + +buffer_atomic_add_f32 v5, off, s[8:11], s3 +// GFX908: encoding: [0x00,0x00,0x34,0xe1,0x00,0x05,0x02,0x03] + +buffer_atomic_add_f32 v5, off, s[8:11], s3 offset:7 +// GFX908: encoding: [0x07,0x00,0x34,0xe1,0x00,0x05,0x02,0x03] + +buffer_atomic_add_f32 v5, off, s[8:11], s3 offset:4095 glc +// GFX908-ERR: error: invalid operand for instruction + +buffer_atomic_add_f32 v5, off, s[8:11], s3 offset:4095 slc +// GFX908: encoding: [0xff,0x0f,0x36,0xe1,0x00,0x05,0x02,0x03] + +buffer_atomic_pk_add_f16 v5, off, s[8:11], s3 offset:4095 +// GFX908: encoding: [0xff,0x0f,0x38,0xe1,0x00,0x05,0x02,0x03] + +buffer_atomic_pk_add_f16 v255, off, s[8:11], s3 offset:4095 +// GFX908: encoding: [0xff,0x0f,0x38,0xe1,0x00,0xff,0x02,0x03] + +buffer_atomic_pk_add_f16 v5, off, s[12:15], s3 offset:4095 +// GFX908: encoding: [0xff,0x0f,0x38,0xe1,0x00,0x05,0x03,0x03] + +buffer_atomic_pk_add_f16 v5, off, s[96:99], s3 offset:4095 +// GFX908: encoding: [0xff,0x0f,0x38,0xe1,0x00,0x05,0x18,0x03] + +buffer_atomic_pk_add_f16 v5, off, s[8:11], s101 offset:4095 +// GFX908: encoding: [0xff,0x0f,0x38,0xe1,0x00,0x05,0x02,0x65] + +buffer_atomic_pk_add_f16 v5, off, s[8:11], m0 offset:4095 +// GFX908: encoding: [0xff,0x0f,0x38,0xe1,0x00,0x05,0x02,0x7c] + +buffer_atomic_pk_add_f16 v5, off, s[8:11], 0 offset:4095 +// GFX908: encoding: [0xff,0x0f,0x38,0xe1,0x00,0x05,0x02,0x80] + +buffer_atomic_pk_add_f16 v5, off, s[8:11], -1 offset:4095 +// GFX908: encoding: [0xff,0x0f,0x38,0xe1,0x00,0x05,0x02,0xc1] + +buffer_atomic_pk_add_f16 v5, v0, s[8:11], s3 idxen offset:4095 +// GFX908: encoding: [0xff,0x2f,0x38,0xe1,0x00,0x05,0x02,0x03] + +buffer_atomic_pk_add_f16 v5, v0, s[8:11], s3 offen offset:4095 +// GFX908: encoding: [0xff,0x1f,0x38,0xe1,0x00,0x05,0x02,0x03] + +buffer_atomic_pk_add_f16 v5, off, s[8:11], s3 +// GFX908: encoding: [0x00,0x00,0x38,0xe1,0x00,0x05,0x02,0x03] + +buffer_atomic_pk_add_f16 v5, off, s[8:11], s3 +// GFX908: encoding: [0x00,0x00,0x38,0xe1,0x00,0x05,0x02,0x03] + +buffer_atomic_pk_add_f16 v5, off, s[8:11], s3 offset:7 +// GFX908: encoding: [0x07,0x00,0x38,0xe1,0x00,0x05,0x02,0x03] + +buffer_atomic_pk_add_f16 v5, off, s[8:11], s3 offset:4095 glc +// GFX908-ERR: error: invalid operand for instruction + +buffer_atomic_pk_add_f16 v5, off, s[8:11], s3 offset:4095 slc +// GFX908: encoding: [0xff,0x0f,0x3a,0xe1,0x00,0x05,0x02,0x03] + +global_atomic_add_f32 v[1:2], v2, off offset:-1 +// GFX908: encoding: [0xff,0x9f,0x34,0xdd,0x01,0x02,0x7f,0x00] + +global_atomic_add_f32 v[1:2], v255, off offset:-1 +// GFX908: encoding: [0xff,0x9f,0x34,0xdd,0x01,0xff,0x7f,0x00] + +global_atomic_add_f32 v[1:2], v2, off +// GFX908: encoding: [0x00,0x80,0x34,0xdd,0x01,0x02,0x7f,0x00] + +global_atomic_pk_add_f16 v[1:2], v2, off offset:-1 +// GFX908: encoding: [0xff,0x9f,0x38,0xdd,0x01,0x02,0x7f,0x00] + +global_atomic_pk_add_f16 v[1:2], v255, off offset:-1 +// GFX908: encoding: [0xff,0x9f,0x38,0xdd,0x01,0xff,0x7f,0x00] + +global_atomic_pk_add_f16 v[1:2], v2, off +// GFX908: encoding: [0x00,0x80,0x38,0xdd,0x01,0x02,0x7f,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/atomic-fadd-insts.txt b/llvm/test/MC/Disassembler/AMDGPU/atomic-fadd-insts.txt new file mode 100644 index 00000000000..20988a40f98 --- /dev/null +++ b/llvm/test/MC/Disassembler/AMDGPU/atomic-fadd-insts.txt @@ -0,0 +1,103 @@ +# RUN: llvm-mc -arch=amdgcn -mcpu=gfx908 -disassemble -show-encoding < %s | FileCheck %s + +# CHECK: buffer_atomic_add_f32 v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x34,0xe1,0x00,0x05,0x02,0x03] +0xff,0x0f,0x34,0xe1,0x00,0x05,0x02,0x03 + +# CHECK: buffer_atomic_add_f32 v255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x34,0xe1,0x00,0xff,0x02,0x03] +0xff,0x0f,0x34,0xe1,0x00,0xff,0x02,0x03 + +# CHECK: buffer_atomic_add_f32 v5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x34,0xe1,0x00,0x05,0x03,0x03] +0xff,0x0f,0x34,0xe1,0x00,0x05,0x03,0x03 + +# CHECK: buffer_atomic_add_f32 v5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x34,0xe1,0x00,0x05,0x18,0x03] +0xff,0x0f,0x34,0xe1,0x00,0x05,0x18,0x03 + +# CHECK: buffer_atomic_add_f32 v5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x34,0xe1,0x00,0x05,0x02,0x65] +0xff,0x0f,0x34,0xe1,0x00,0x05,0x02,0x65 + +# CHECK: buffer_atomic_add_f32 v5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x34,0xe1,0x00,0x05,0x02,0x7c] +0xff,0x0f,0x34,0xe1,0x00,0x05,0x02,0x7c + +# CHECK: buffer_atomic_add_f32 v5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x34,0xe1,0x00,0x05,0x02,0x80] +0xff,0x0f,0x34,0xe1,0x00,0x05,0x02,0x80 + +# CHECK: buffer_atomic_add_f32 v5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x34,0xe1,0x00,0x05,0x02,0xc1] +0xff,0x0f,0x34,0xe1,0x00,0x05,0x02,0xc1 + +# CHECK: buffer_atomic_add_f32 v5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x34,0xe1,0x00,0x05,0x02,0x03] +0xff,0x2f,0x34,0xe1,0x00,0x05,0x02,0x03 + +# CHECK: buffer_atomic_add_f32 v5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x34,0xe1,0x00,0x05,0x02,0x03] +0xff,0x1f,0x34,0xe1,0x00,0x05,0x02,0x03 + +# CHECK: buffer_atomic_add_f32 v5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x34,0xe1,0x00,0x05,0x02,0x03] +0x00,0x00,0x34,0xe1,0x00,0x05,0x02,0x03 + +# CHECK: buffer_atomic_add_f32 v5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x34,0xe1,0x00,0x05,0x02,0x03] +0x00,0x00,0x34,0xe1,0x00,0x05,0x02,0x03 + +# CHECK: buffer_atomic_add_f32 v5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x34,0xe1,0x00,0x05,0x02,0x03] +0x07,0x00,0x34,0xe1,0x00,0x05,0x02,0x03 + +# CHECK: buffer_atomic_add_f32 v5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x36,0xe1,0x00,0x05,0x02,0x03] +0xff,0x0f,0x36,0xe1,0x00,0x05,0x02,0x03 + +# CHECK: buffer_atomic_pk_add_f16 v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x38,0xe1,0x00,0x05,0x02,0x03] +0xff,0x0f,0x38,0xe1,0x00,0x05,0x02,0x03 + +# CHECK: buffer_atomic_pk_add_f16 v255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x38,0xe1,0x00,0xff,0x02,0x03] +0xff,0x0f,0x38,0xe1,0x00,0xff,0x02,0x03 + +# CHECK: buffer_atomic_pk_add_f16 v5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x38,0xe1,0x00,0x05,0x03,0x03] +0xff,0x0f,0x38,0xe1,0x00,0x05,0x03,0x03 + +# CHECK: buffer_atomic_pk_add_f16 v5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x38,0xe1,0x00,0x05,0x18,0x03] +0xff,0x0f,0x38,0xe1,0x00,0x05,0x18,0x03 + +# CHECK: buffer_atomic_pk_add_f16 v5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x38,0xe1,0x00,0x05,0x02,0x65] +0xff,0x0f,0x38,0xe1,0x00,0x05,0x02,0x65 + +# CHECK: buffer_atomic_pk_add_f16 v5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x38,0xe1,0x00,0x05,0x02,0x7c] +0xff,0x0f,0x38,0xe1,0x00,0x05,0x02,0x7c + +# CHECK: buffer_atomic_pk_add_f16 v5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x38,0xe1,0x00,0x05,0x02,0x80] +0xff,0x0f,0x38,0xe1,0x00,0x05,0x02,0x80 + +# CHECK: buffer_atomic_pk_add_f16 v5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x38,0xe1,0x00,0x05,0x02,0xc1] +0xff,0x0f,0x38,0xe1,0x00,0x05,0x02,0xc1 + +# CHECK: buffer_atomic_pk_add_f16 v5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x38,0xe1,0x00,0x05,0x02,0x03] +0xff,0x2f,0x38,0xe1,0x00,0x05,0x02,0x03 + +# CHECK: buffer_atomic_pk_add_f16 v5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x38,0xe1,0x00,0x05,0x02,0x03] +0xff,0x1f,0x38,0xe1,0x00,0x05,0x02,0x03 + +# CHECK: buffer_atomic_pk_add_f16 v5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x38,0xe1,0x00,0x05,0x02,0x03] +0x00,0x00,0x38,0xe1,0x00,0x05,0x02,0x03 + +# CHECK: buffer_atomic_pk_add_f16 v5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x38,0xe1,0x00,0x05,0x02,0x03] +0x00,0x00,0x38,0xe1,0x00,0x05,0x02,0x03 + +# CHECK: buffer_atomic_pk_add_f16 v5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x38,0xe1,0x00,0x05,0x02,0x03] +0x07,0x00,0x38,0xe1,0x00,0x05,0x02,0x03 + +# CHECK: buffer_atomic_pk_add_f16 v5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x3a,0xe1,0x00,0x05,0x02,0x03] +0xff,0x0f,0x3a,0xe1,0x00,0x05,0x02,0x03 + +# CHECK: global_atomic_add_f32 v[1:2], v2, off offset:-1 ; encoding: [0xff,0x9f,0x34,0xdd,0x01,0x02,0x7f,0x00] +0xff,0x9f,0x34,0xdd,0x01,0x02,0x7f,0x00 + +# CHECK: global_atomic_add_f32 v[1:2], v255, off offset:-1 ; encoding: [0xff,0x9f,0x34,0xdd,0x01,0xff,0x7f,0x00] +0xff,0x9f,0x34,0xdd,0x01,0xff,0x7f,0x00 + +# CHECK: global_atomic_add_f32 v[1:2], v2, off ; encoding: [0x00,0x80,0x34,0xdd,0x01,0x02,0x7f,0x00] +0x00,0x80,0x34,0xdd,0x01,0x02,0x7f,0x00 + +# CHECK: global_atomic_pk_add_f16 v[1:2], v2, off offset:-1 ; encoding: [0xff,0x9f,0x38,0xdd,0x01,0x02,0x7f,0x00] +0xff,0x9f,0x38,0xdd,0x01,0x02,0x7f,0x00 + +# CHECK: global_atomic_pk_add_f16 v[1:2], v255, off offset:-1 ; encoding: [0xff,0x9f,0x38,0xdd,0x01,0xff,0x7f,0x00] +0xff,0x9f,0x38,0xdd,0x01,0xff,0x7f,0x00 + +# CHECK: global_atomic_pk_add_f16 v[1:2], v2, off ; encoding: [0x00,0x80,0x38,0xdd,0x01,0x02,0x7f,0x00] +0x00,0x80,0x38,0xdd,0x01,0x02,0x7f,0x00 |