diff options
author | Nicolai Haehnle <nhaehnle@gmail.com> | 2019-07-01 17:17:45 +0000 |
---|---|---|
committer | Nicolai Haehnle <nhaehnle@gmail.com> | 2019-07-01 17:17:45 +0000 |
commit | 4dc3b2bf95b0910facc12a84118239237ba92e18 (patch) | |
tree | 3f2b108806dc4776e1207ea238f7334b9229a76f | |
parent | ddc57afab9ef4e1cf708dc5454c0842c3e68f1e0 (diff) | |
download | bcm5719-llvm-4dc3b2bf95b0910facc12a84118239237ba92e18.tar.gz bcm5719-llvm-4dc3b2bf95b0910facc12a84118239237ba92e18.zip |
AMDGPU: Support GDS atomics
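Summary:
Select DS atomic instructions with the gds bit set for atomics in the region address space (AMDGPUAS::REGION_ADDRESS). Before such an atomic, M0 is initialized to the GDS size read from the "amdgpu-gds-size" function attribute.
Original patch by Marek Olšák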
Change-Id: Ia97d5d685a63a377d86e82942436d1fe6e429bab
Reviewers: mareko, arsenm, rampitec
Subscribers: kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, jfb, Petar.Avramovic, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D63452
llvm-svn: 364814
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 17 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUInstructions.td | 21 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 5 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/DSInstructions.td | 88 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 6 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstrInfo.td | 2 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 7 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h | 5 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/gds-atomic.ll | 128 |
9 files changed, 225 insertions, 54 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index da2cf7b076d..c8d4557729b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -568,8 +568,6 @@ SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const { const SITargetLowering& Lowering = *static_cast<const SITargetLowering*>(getTargetLowering()); - // Write max value to m0 before each load operation - assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain"); SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N), @@ -587,10 +585,17 @@ SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const { } SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const { - if (cast<MemSDNode>(N)->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS || - !Subtarget->ldsRequiresM0Init()) - return N; - return glueCopyToM0(N, CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32)); + unsigned AS = cast<MemSDNode>(N)->getAddressSpace(); + if (AS == AMDGPUAS::LOCAL_ADDRESS) { + if (Subtarget->ldsRequiresM0Init()) + return glueCopyToM0(N, CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32)); + } else if (AS == AMDGPUAS::REGION_ADDRESS) { + MachineFunction &MF = CurDAG->getMachineFunction(); + unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize(); + return + glueCopyToM0(N, CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i32)); + } + return N; } MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index be0ba2fe7ae..7c077cdaf05 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -353,6 +353,10 @@ class LocalAddress : CodePatPred<[{ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; }]>; +class RegionAddress : CodePatPred<[{ + return 
cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::REGION_ADDRESS; +}]>; + class GlobalAddress : CodePatPred<[{ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS; }]>; @@ -402,6 +406,9 @@ class PrivateStore <SDPatternOperator op> : StoreFrag <op>, PrivateAddress; class LocalLoad <SDPatternOperator op> : LoadFrag <op>, LocalAddress; class LocalStore <SDPatternOperator op> : StoreFrag <op>, LocalAddress; +class RegionLoad <SDPatternOperator op> : LoadFrag <op>, RegionAddress; +class RegionStore <SDPatternOperator op> : StoreFrag <op>, RegionAddress; + class GlobalLoad <SDPatternOperator op> : LoadFrag<op>, GlobalLoadAddress; class GlobalStore <SDPatternOperator op> : StoreFrag<op>, GlobalAddress; @@ -497,6 +504,13 @@ class local_binary_atomic_op<SDNode atomic_op> : return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; }]>; +class region_binary_atomic_op<SDNode atomic_op> : + PatFrag<(ops node:$ptr, node:$value), + (atomic_op node:$ptr, node:$value), [{ + return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::REGION_ADDRESS; +}]>; + + def atomic_swap_local : local_binary_atomic_op<atomic_swap>; def atomic_load_add_local : local_binary_atomic_op<atomic_load_add>; def atomic_load_sub_local : local_binary_atomic_op<atomic_load_sub>; @@ -521,6 +535,13 @@ class AtomicCmpSwapLocal <SDNode cmp_swap_node> : PatFrag< return AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; }]>; +class AtomicCmpSwapRegion <SDNode cmp_swap_node> : PatFrag< + (ops node:$ptr, node:$cmp, node:$swap), + (cmp_swap_node node:$ptr, node:$cmp, node:$swap), [{ + AtomicSDNode *AN = cast<AtomicSDNode>(N); + return AN->getAddressSpace() == AMDGPUAS::REGION_ADDRESS; +}]>; + def atomic_cmp_swap_local : AtomicCmpSwapLocal <atomic_cmp_swap>; multiclass global_binary_atomic_op<SDNode atomic_op> { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index e198a9c7eb8..aaed280a127 100644 --- 
a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -142,7 +142,7 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, unsigned Threshold = 0; if (AS == AMDGPUAS::PRIVATE_ADDRESS) Threshold = ThresholdPrivate; - else if (AS == AMDGPUAS::LOCAL_ADDRESS) + else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) Threshold = ThresholdLocal; else continue; @@ -160,7 +160,8 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0; if (AllocaSize > MaxAlloca) continue; - } else if (AS == AMDGPUAS::LOCAL_ADDRESS) { + } else if (AS == AMDGPUAS::LOCAL_ADDRESS || + AS == AMDGPUAS::REGION_ADDRESS) { LocalGEPsSeen++; // Inhibit unroll for local memory if we have seen addressing not to // a variable, most likely we will be unable to combine it. diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index 7890fa1502e..b67306107db 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -601,9 +601,9 @@ def : GCNPat < (DS_SWIZZLE_B32 $src, (as_i16imm $offset16), (i1 0)) >; -class DSReadPat <DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat < +class DSReadPat <DS_Pseudo inst, ValueType vt, PatFrag frag, int gds=0> : GCNPat < (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))), - (inst $ptr, (as_i16imm $offset), (i1 0)) + (inst $ptr, (as_i16imm $offset), (i1 gds)) >; multiclass DSReadPat_mc<DS_Pseudo inst, ValueType vt, string frag> { @@ -657,9 +657,9 @@ def : DSReadPat_D16<DS_READ_I8_D16, sextloadi8_d16_lo_local, v2i16>; def : DSReadPat_D16<DS_READ_I8_D16, sextloadi8_d16_lo_local, v2f16>; } -class DSWritePat <DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat < +class DSWritePat <DS_Pseudo inst, ValueType vt, PatFrag frag, int gds=0> : GCNPat < (frag vt:$value, (DS1Addr1Offset i32:$ptr, i32:$offset)), 
- (inst $ptr, $value, (as_i16imm $offset), (i1 0)) + (inst $ptr, $value, (as_i16imm $offset), (i1 gds)) >; multiclass DSWritePat_mc <DS_Pseudo inst, ValueType vt, string frag> { @@ -734,75 +734,79 @@ defm : DSWritePat_mc <DS_WRITE_B64, v2i32, "store_align8_local">; defm : DSWritePat_mc <DS_WRITE_B128, v4i32, "store_align16_local">; } // End AddedComplexity = 100 -class DSAtomicRetPat<DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat < +class DSAtomicRetPat<DS_Pseudo inst, ValueType vt, PatFrag frag, bit gds=0> : GCNPat < (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value), - (inst $ptr, $value, (as_i16imm $offset), (i1 0)) + (inst $ptr, $value, (as_i16imm $offset), (i1 gds)) >; multiclass DSAtomicRetPat_mc<DS_Pseudo inst, ValueType vt, string frag> { let OtherPredicates = [LDSRequiresM0Init] in { - def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_m0")>; + def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_local_m0")>; } let OtherPredicates = [NotLDSRequiresM0Init] in { def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, - !cast<PatFrag>(frag)>; + !cast<PatFrag>(frag#"_local")>; } + + def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_region_m0"), 1>; } -class DSAtomicCmpXChg<DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat < +class DSAtomicCmpXChg<DS_Pseudo inst, ValueType vt, PatFrag frag, bit gds=0> : GCNPat < (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$cmp, vt:$swap), - (inst $ptr, $cmp, $swap, (as_i16imm $offset), (i1 0)) + (inst $ptr, $cmp, $swap, (as_i16imm $offset), (i1 gds)) >; multiclass DSAtomicCmpXChg_mc<DS_Pseudo inst, ValueType vt, string frag> { let OtherPredicates = [LDSRequiresM0Init] in { - def : DSAtomicCmpXChg<inst, vt, !cast<PatFrag>(frag#"_m0")>; + def : DSAtomicCmpXChg<inst, vt, !cast<PatFrag>(frag#"_local_m0")>; } let OtherPredicates = [NotLDSRequiresM0Init] in { def : DSAtomicCmpXChg<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, - !cast<PatFrag>(frag)>; + !cast<PatFrag>(frag#"_local")>; } + 
+ def : DSAtomicCmpXChg<inst, vt, !cast<PatFrag>(frag#"_region_m0"), 1>; } // 32-bit atomics. -defm : DSAtomicRetPat_mc<DS_WRXCHG_RTN_B32, i32, "atomic_swap_local">; -defm : DSAtomicRetPat_mc<DS_ADD_RTN_U32, i32, "atomic_load_add_local">; -defm : DSAtomicRetPat_mc<DS_SUB_RTN_U32, i32, "atomic_load_sub_local">; -defm : DSAtomicRetPat_mc<DS_INC_RTN_U32, i32, "atomic_inc_local">; -defm : DSAtomicRetPat_mc<DS_DEC_RTN_U32, i32, "atomic_dec_local">; -defm : DSAtomicRetPat_mc<DS_AND_RTN_B32, i32, "atomic_load_and_local">; -defm : DSAtomicRetPat_mc<DS_OR_RTN_B32, i32, "atomic_load_or_local">; -defm : DSAtomicRetPat_mc<DS_XOR_RTN_B32, i32, "atomic_load_xor_local">; -defm : DSAtomicRetPat_mc<DS_MIN_RTN_I32, i32, "atomic_load_min_local">; -defm : DSAtomicRetPat_mc<DS_MAX_RTN_I32, i32, "atomic_load_max_local">; -defm : DSAtomicRetPat_mc<DS_MIN_RTN_U32, i32, "atomic_load_umin_local">; -defm : DSAtomicRetPat_mc<DS_MAX_RTN_U32, i32, "atomic_load_umax_local">; -defm : DSAtomicCmpXChg_mc<DS_CMPST_RTN_B32, i32, "atomic_cmp_swap_local">; -defm : DSAtomicRetPat_mc<DS_MIN_RTN_F32, f32, "atomic_load_fmin_local">; -defm : DSAtomicRetPat_mc<DS_MAX_RTN_F32, f32, "atomic_load_fmax_local">; -defm : DSAtomicRetPat_mc<DS_ADD_RTN_F32, f32, "atomic_load_fadd_local">; +defm : DSAtomicRetPat_mc<DS_WRXCHG_RTN_B32, i32, "atomic_swap">; +defm : DSAtomicRetPat_mc<DS_ADD_RTN_U32, i32, "atomic_load_add">; +defm : DSAtomicRetPat_mc<DS_SUB_RTN_U32, i32, "atomic_load_sub">; +defm : DSAtomicRetPat_mc<DS_INC_RTN_U32, i32, "atomic_inc">; +defm : DSAtomicRetPat_mc<DS_DEC_RTN_U32, i32, "atomic_dec">; +defm : DSAtomicRetPat_mc<DS_AND_RTN_B32, i32, "atomic_load_and">; +defm : DSAtomicRetPat_mc<DS_OR_RTN_B32, i32, "atomic_load_or">; +defm : DSAtomicRetPat_mc<DS_XOR_RTN_B32, i32, "atomic_load_xor">; +defm : DSAtomicRetPat_mc<DS_MIN_RTN_I32, i32, "atomic_load_min">; +defm : DSAtomicRetPat_mc<DS_MAX_RTN_I32, i32, "atomic_load_max">; +defm : DSAtomicRetPat_mc<DS_MIN_RTN_U32, i32, "atomic_load_umin">; +defm : 
DSAtomicRetPat_mc<DS_MAX_RTN_U32, i32, "atomic_load_umax">; +defm : DSAtomicCmpXChg_mc<DS_CMPST_RTN_B32, i32, "atomic_cmp_swap">; +defm : DSAtomicRetPat_mc<DS_MIN_RTN_F32, f32, "atomic_load_fmin">; +defm : DSAtomicRetPat_mc<DS_MAX_RTN_F32, f32, "atomic_load_fmax">; +defm : DSAtomicRetPat_mc<DS_ADD_RTN_F32, f32, "atomic_load_fadd">; // 64-bit atomics. -defm : DSAtomicRetPat_mc<DS_WRXCHG_RTN_B64, i64, "atomic_swap_local">; -defm : DSAtomicRetPat_mc<DS_ADD_RTN_U64, i64, "atomic_load_add_local">; -defm : DSAtomicRetPat_mc<DS_SUB_RTN_U64, i64, "atomic_load_sub_local">; -defm : DSAtomicRetPat_mc<DS_INC_RTN_U64, i64, "atomic_inc_local">; -defm : DSAtomicRetPat_mc<DS_DEC_RTN_U64, i64, "atomic_dec_local">; -defm : DSAtomicRetPat_mc<DS_AND_RTN_B64, i64, "atomic_load_and_local">; -defm : DSAtomicRetPat_mc<DS_OR_RTN_B64, i64, "atomic_load_or_local">; -defm : DSAtomicRetPat_mc<DS_XOR_RTN_B64, i64, "atomic_load_xor_local">; -defm : DSAtomicRetPat_mc<DS_MIN_RTN_I64, i64, "atomic_load_min_local">; -defm : DSAtomicRetPat_mc<DS_MAX_RTN_I64, i64, "atomic_load_max_local">; -defm : DSAtomicRetPat_mc<DS_MIN_RTN_U64, i64, "atomic_load_umin_local">; -defm : DSAtomicRetPat_mc<DS_MAX_RTN_U64, i64, "atomic_load_umax_local">; - -defm : DSAtomicCmpXChg_mc<DS_CMPST_RTN_B64, i64, "atomic_cmp_swap_local">; +defm : DSAtomicRetPat_mc<DS_WRXCHG_RTN_B64, i64, "atomic_swap">; +defm : DSAtomicRetPat_mc<DS_ADD_RTN_U64, i64, "atomic_load_add">; +defm : DSAtomicRetPat_mc<DS_SUB_RTN_U64, i64, "atomic_load_sub">; +defm : DSAtomicRetPat_mc<DS_INC_RTN_U64, i64, "atomic_inc">; +defm : DSAtomicRetPat_mc<DS_DEC_RTN_U64, i64, "atomic_dec">; +defm : DSAtomicRetPat_mc<DS_AND_RTN_B64, i64, "atomic_load_and">; +defm : DSAtomicRetPat_mc<DS_OR_RTN_B64, i64, "atomic_load_or">; +defm : DSAtomicRetPat_mc<DS_XOR_RTN_B64, i64, "atomic_load_xor">; +defm : DSAtomicRetPat_mc<DS_MIN_RTN_I64, i64, "atomic_load_min">; +defm : DSAtomicRetPat_mc<DS_MAX_RTN_I64, i64, "atomic_load_max">; +defm : DSAtomicRetPat_mc<DS_MIN_RTN_U64, i64, 
"atomic_load_umin">; +defm : DSAtomicRetPat_mc<DS_MAX_RTN_U64, i64, "atomic_load_umax">; + +defm : DSAtomicCmpXChg_mc<DS_CMPST_RTN_B64, i64, "atomic_cmp_swap">; def : Pat < (SIds_ordered_count i32:$value, i16:$offset), diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 398f6887644..44d9987c1f7 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1173,7 +1173,7 @@ bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT, } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) { unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize(); return (MemVT.getSizeInBits() <= MaxPrivateBits); - } else if (AS == AMDGPUAS::LOCAL_ADDRESS) { + } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { return (MemVT.getSizeInBits() <= 2 * 32); } return true; @@ -7135,7 +7135,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { default: llvm_unreachable("unsupported private_element_size"); } - } else if (AS == AMDGPUAS::LOCAL_ADDRESS) { + } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { // Use ds_read_b128 if possible. if (Subtarget->useDS128() && Load->getAlignment() >= 16 && MemVT.getStoreSize() == 16) @@ -7557,7 +7557,7 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { default: llvm_unreachable("unsupported private_element_size"); } - } else if (AS == AMDGPUAS::LOCAL_ADDRESS) { + } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { // Use ds_write_b128 if possible. 
if (Subtarget->useDS128() && Store->getAlignment() >= 16 && VT.getStoreSize() == 16 && NumElements != 3) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index a988a504ccf..b53b04abefc 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -505,6 +505,7 @@ multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0, >; def _local_m0 : local_binary_atomic_op <!cast<SDNode>(NAME#"_glue")>; + def _region_m0 : region_binary_atomic_op <!cast<SDNode>(NAME#"_glue")>; } defm atomic_load_add : SIAtomicM0Glue2 <"LOAD_ADD">; @@ -528,6 +529,7 @@ def atomic_cmp_swap_glue : SDNode <"ISD::ATOMIC_CMP_SWAP", SDTAtomic3, >; def atomic_cmp_swap_local_m0 : AtomicCmpSwapLocal<atomic_cmp_swap_glue>; +def atomic_cmp_swap_region_m0 : AtomicCmpSwapRegion<atomic_cmp_swap_glue>; def as_i1imm : SDNodeXForm<imm, [{ diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index d6c93fd293a..8bb8ea71332 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -46,7 +46,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) ImplicitBufferPtr(false), ImplicitArgPtr(false), GITPtrHigh(0xffffffff), - HighBitsOf32BitAddress(0) { + HighBitsOf32BitAddress(0), + GDSSize(0) { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const Function &F = MF.getFunction(); FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F); @@ -159,6 +160,10 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) S = A.getValueAsString(); if (!S.empty()) S.consumeInteger(0, HighBitsOf32BitAddress); + + S = F.getFnAttribute("amdgpu-gds-size").getValueAsString(); + if (!S.empty()) + S.consumeInteger(0, GDSSize); } void SIMachineFunctionInfo::limitOccupancy(const MachineFunction &MF) { diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h 
b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 421541c5e28..329b38c9fcc 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -262,6 +262,7 @@ private: unsigned GITPtrHigh; unsigned HighBitsOf32BitAddress; + unsigned GDSSize; // Current recorded maximum possible occupancy. unsigned Occupancy; @@ -489,6 +490,10 @@ public: return HighBitsOf32BitAddress; } + unsigned getGDSSize() const { + return GDSSize; + } + unsigned getNumUserSGPRs() const { return NumUserSGPRs; } diff --git a/llvm/test/CodeGen/AMDGPU/gds-atomic.ll b/llvm/test/CodeGen/AMDGPU/gds-atomic.ll new file mode 100644 index 00000000000..abd00d9fbb7 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/gds-atomic.ll @@ -0,0 +1,128 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC %s + +; FUNC-LABEL: {{^}}atomic_add_ret_gds: +; GCN-DAG: v_mov_b32_e32 v[[OFF:[0-9]+]], s +; GCN-DAG: s_movk_i32 m0, 0x1000 +; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v[[OFF]], v{{[0-9]+}} gds +define amdgpu_kernel void @atomic_add_ret_gds(i32 addrspace(1)* %out, i32 addrspace(2)* %gds) #1 { + %val = atomicrmw volatile add i32 addrspace(2)* %gds, i32 5 acq_rel + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}atomic_add_ret_gds_const_offset: +; GCN: s_movk_i32 m0, 0x80 +; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:20 gds +define amdgpu_kernel void @atomic_add_ret_gds_const_offset(i32 addrspace(1)* %out, i32 addrspace(2)* %gds) #0 { + %gep = getelementptr i32, i32 addrspace(2)* %gds, i32 5 + %val = atomicrmw volatile add i32 addrspace(2)* %gep, i32 5 
acq_rel + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}atomic_sub_ret_gds: +; GCN-DAG: v_mov_b32_e32 v[[OFF:[0-9]+]], s +; GCN-DAG: s_movk_i32 m0, 0x1000 +; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v[[OFF]], v{{[0-9]+}} gds +define amdgpu_kernel void @atomic_sub_ret_gds(i32 addrspace(1)* %out, i32 addrspace(2)* %gds) #1 { + %val = atomicrmw sub i32 addrspace(2)* %gds, i32 5 acq_rel + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}atomic_and_ret_gds: +; GCN-DAG: v_mov_b32_e32 v[[OFF:[0-9]+]], s +; GCN-DAG: s_movk_i32 m0, 0x1000 +; GCN: ds_and_rtn_b32 v{{[0-9]+}}, v[[OFF]], v{{[0-9]+}} gds +define amdgpu_kernel void @atomic_and_ret_gds(i32 addrspace(1)* %out, i32 addrspace(2)* %gds) #1 { + %val = atomicrmw and i32 addrspace(2)* %gds, i32 5 acq_rel + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}atomic_or_ret_gds: +; GCN-DAG: v_mov_b32_e32 v[[OFF:[0-9]+]], s +; GCN-DAG: s_movk_i32 m0, 0x1000 +; GCN: ds_or_rtn_b32 v{{[0-9]+}}, v[[OFF]], v{{[0-9]+}} gds +define amdgpu_kernel void @atomic_or_ret_gds(i32 addrspace(1)* %out, i32 addrspace(2)* %gds) #1 { + %val = atomicrmw or i32 addrspace(2)* %gds, i32 5 acq_rel + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}atomic_xor_ret_gds: +; GCN-DAG: v_mov_b32_e32 v[[OFF:[0-9]+]], s +; GCN-DAG: s_movk_i32 m0, 0x1000 +; GCN: ds_xor_rtn_b32 v{{[0-9]+}}, v[[OFF]], v{{[0-9]+}} gds +define amdgpu_kernel void @atomic_xor_ret_gds(i32 addrspace(1)* %out, i32 addrspace(2)* %gds) #1 { + %val = atomicrmw xor i32 addrspace(2)* %gds, i32 5 acq_rel + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}atomic_umin_ret_gds: +; GCN-DAG: v_mov_b32_e32 v[[OFF:[0-9]+]], s +; GCN-DAG: s_movk_i32 m0, 0x1000 +; GCN: ds_min_rtn_u32 v{{[0-9]+}}, v[[OFF]], v{{[0-9]+}} gds +define amdgpu_kernel void @atomic_umin_ret_gds(i32 addrspace(1)* %out, i32 addrspace(2)* %gds) #1 { + %val = atomicrmw umin i32 addrspace(2)* %gds, i32 5 
acq_rel + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}atomic_umax_ret_gds: +; GCN-DAG: v_mov_b32_e32 v[[OFF:[0-9]+]], s +; GCN-DAG: s_movk_i32 m0, 0x1000 +; GCN: ds_max_rtn_u32 v{{[0-9]+}}, v[[OFF]], v{{[0-9]+}} gds +define amdgpu_kernel void @atomic_umax_ret_gds(i32 addrspace(1)* %out, i32 addrspace(2)* %gds) #1 { + %val = atomicrmw umax i32 addrspace(2)* %gds, i32 5 acq_rel + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}atomic_imin_ret_gds: +; GCN-DAG: v_mov_b32_e32 v[[OFF:[0-9]+]], s +; GCN-DAG: s_movk_i32 m0, 0x1000 +; GCN: ds_min_rtn_i32 v{{[0-9]+}}, v[[OFF]], v{{[0-9]+}} gds +define amdgpu_kernel void @atomic_imin_ret_gds(i32 addrspace(1)* %out, i32 addrspace(2)* %gds) #1 { + %val = atomicrmw min i32 addrspace(2)* %gds, i32 5 acq_rel + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}atomic_imax_ret_gds: +; GCN-DAG: v_mov_b32_e32 v[[OFF:[0-9]+]], s +; GCN-DAG: s_movk_i32 m0, 0x1000 +; GCN: ds_max_rtn_i32 v{{[0-9]+}}, v[[OFF]], v{{[0-9]+}} gds +define amdgpu_kernel void @atomic_imax_ret_gds(i32 addrspace(1)* %out, i32 addrspace(2)* %gds) #1 { + %val = atomicrmw max i32 addrspace(2)* %gds, i32 5 acq_rel + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}atomic_xchg_ret_gds: +; GCN-DAG: v_mov_b32_e32 v[[OFF:[0-9]+]], s +; GCN-DAG: s_movk_i32 m0, 0x1000 +; GCN: ds_wrxchg_rtn_b32 v{{[0-9]+}}, v[[OFF]], v{{[0-9]+}} gds +define amdgpu_kernel void @atomic_xchg_ret_gds(i32 addrspace(1)* %out, i32 addrspace(2)* %gds) #1 { + %val = atomicrmw xchg i32 addrspace(2)* %gds, i32 5 acq_rel + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}atomic_cmpxchg_ret_gds: +; GCN-DAG: v_mov_b32_e32 v[[OFF:[0-9]+]], s +; GCN-DAG: s_movk_i32 m0, 0x1000 +; GCN: ds_cmpst_rtn_b32 v{{[0-9]+}}, v[[OFF:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} gds +define amdgpu_kernel void @atomic_cmpxchg_ret_gds(i32 addrspace(1)* %out, i32 addrspace(2)* %gds) #1 { + %val = 
cmpxchg i32 addrspace(2)* %gds, i32 0, i32 1 acquire acquire + %x = extractvalue { i32, i1 } %val, 0 + store i32 %x, i32 addrspace(1)* %out + ret void +} + +attributes #0 = { nounwind "amdgpu-gds-size"="128" } +attributes #1 = { nounwind "amdgpu-gds-size"="4096" } |