diff options
author | Artem Belevich <tra@google.com> | 2016-09-28 17:25:38 +0000 |
---|---|---|
committer | Artem Belevich <tra@google.com> | 2016-09-28 17:25:38 +0000 |
commit | 3e1211581c7e9b65511bca0b7d24eaf4bc4e032f (patch) | |
tree | 9ba5a2744358610672bab7a1104ff4c6716395ad /llvm/lib | |
parent | f0022125e099c400dee4f8a0299c5b4579193a53 (diff) | |
download | bcm5719-llvm-3e1211581c7e9b65511bca0b7d24eaf4bc4e032f.tar.gz bcm5719-llvm-3e1211581c7e9b65511bca0b7d24eaf4bc4e032f.zip |
[NVPTX] Added intrinsics for atom.gen.{sys|cta}.* instructions.
These are only available on sm_60+ GPUs.
Differential Revision: https://reviews.llvm.org/D24943
llvm-svn: 282607
Diffstat (limited to 'llvm/lib')
-rw-r--r-- | llvm/lib/Target/NVPTX/NVPTX.td | 9 | ||||
-rw-r--r-- | llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 37 | ||||
-rw-r--r-- | llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 4 | ||||
-rw-r--r-- | llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 196 | ||||
-rw-r--r-- | llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp | 2 | ||||
-rw-r--r-- | llvm/lib/Target/NVPTX/NVPTXSubtarget.h | 8 | ||||
-rw-r--r-- | llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp | 23 |
7 files changed, 263 insertions, 16 deletions
diff --git a/llvm/lib/Target/NVPTX/NVPTX.td b/llvm/lib/Target/NVPTX/NVPTX.td index 032991a20cc..c77ddbc9978 100644 --- a/llvm/lib/Target/NVPTX/NVPTX.td +++ b/llvm/lib/Target/NVPTX/NVPTX.td @@ -51,6 +51,9 @@ def SM61 : SubtargetFeature<"sm_61", "SmVersion", "61", def SM62 : SubtargetFeature<"sm_62", "SmVersion", "62", "Target SM 6.2">; +def SATOM : SubtargetFeature<"satom", "HasAtomScope", "true", + "Atomic operations with scope">; + // PTX Versions def PTX32 : SubtargetFeature<"ptx32", "PTXVersion", "32", "Use PTX version 3.2">; @@ -81,9 +84,9 @@ def : Proc<"sm_37", [SM37, PTX41]>; def : Proc<"sm_50", [SM50, PTX40]>; def : Proc<"sm_52", [SM52, PTX41]>; def : Proc<"sm_53", [SM53, PTX42]>; -def : Proc<"sm_60", [SM60, PTX50]>; -def : Proc<"sm_61", [SM61, PTX50]>; -def : Proc<"sm_62", [SM62, PTX50]>; +def : Proc<"sm_60", [SM60, PTX50, SATOM]>; +def : Proc<"sm_61", [SM61, PTX50, SATOM]>; +def : Proc<"sm_62", [SM62, PTX50, SATOM]>; def NVPTXInstrInfo : InstrInfo { } diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index db31d030e9c..d894ba378a3 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -3274,20 +3274,34 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( return false; case Intrinsic::nvvm_atomic_load_add_f32: - Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = MVT::f32; - Info.ptrVal = I.getArgOperand(0); - Info.offset = 0; - Info.vol = 0; - Info.readMem = true; - Info.writeMem = true; - Info.align = 0; - return true; - case Intrinsic::nvvm_atomic_load_inc_32: case Intrinsic::nvvm_atomic_load_dec_32: + + case Intrinsic::nvvm_atomic_add_gen_f_cta: + case Intrinsic::nvvm_atomic_add_gen_f_sys: + case Intrinsic::nvvm_atomic_add_gen_i_cta: + case Intrinsic::nvvm_atomic_add_gen_i_sys: + case Intrinsic::nvvm_atomic_and_gen_i_cta: + case Intrinsic::nvvm_atomic_and_gen_i_sys: + case Intrinsic::nvvm_atomic_cas_gen_i_cta: + case 
Intrinsic::nvvm_atomic_cas_gen_i_sys: + case Intrinsic::nvvm_atomic_dec_gen_i_cta: + case Intrinsic::nvvm_atomic_dec_gen_i_sys: + case Intrinsic::nvvm_atomic_inc_gen_i_cta: + case Intrinsic::nvvm_atomic_inc_gen_i_sys: + case Intrinsic::nvvm_atomic_max_gen_i_cta: + case Intrinsic::nvvm_atomic_max_gen_i_sys: + case Intrinsic::nvvm_atomic_min_gen_i_cta: + case Intrinsic::nvvm_atomic_min_gen_i_sys: + case Intrinsic::nvvm_atomic_or_gen_i_cta: + case Intrinsic::nvvm_atomic_or_gen_i_sys: + case Intrinsic::nvvm_atomic_exch_gen_i_cta: + case Intrinsic::nvvm_atomic_exch_gen_i_sys: + case Intrinsic::nvvm_atomic_xor_gen_i_cta: + case Intrinsic::nvvm_atomic_xor_gen_i_sys: { + auto &DL = I.getModule()->getDataLayout(); Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = MVT::i32; + Info.memVT = getValueType(DL, I.getType()); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.vol = 0; @@ -3295,6 +3309,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.writeMem = true; Info.align = 0; return true; + } case Intrinsic::nvvm_ldu_global_i: case Intrinsic::nvvm_ldu_global_f: diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 8418e0e13a9..a71c85473d1 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -131,6 +131,10 @@ def hasAtomRedGen64 : Predicate<"Subtarget->hasAtomRedGen64()">; def useAtomRedG64forGen64 : Predicate<"!Subtarget->hasAtomRedGen64() && Subtarget->hasAtomRedG64()">; def hasAtomAddF32 : Predicate<"Subtarget->hasAtomAddF32()">; +def hasAtomAddF64 : Predicate<"Subtarget->hasAtomAddF64()">; +def hasAtomScope : Predicate<"Subtarget->hasAtomScope()">; +def hasAtomBitwise64 : Predicate<"Subtarget->hasAtomBitwise64()">; +def hasAtomMinMax64 : Predicate<"Subtarget->hasAtomMinMax64()">; def hasVote : Predicate<"Subtarget->hasVote()">; def hasDouble : Predicate<"Subtarget->hasDouble()">; def reqPTX20 : Predicate<"Subtarget->reqPTX20()">; diff --git 
a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index ed16afa2475..b0408f12f5b 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -1377,8 +1377,204 @@ defm INT_PTX_ATOM_CAS_GEN_64 : F_ATOMIC_3<Int64Regs, "", ".b64", ".cas", defm INT_PTX_ATOM_CAS_GEN_64_USE_G : F_ATOMIC_3<Int64Regs, ".global", ".b64", ".cas", atomic_cmp_swap_64_gen, i64imm, useAtomRedG64forGen64>; +// Support for scoped atomic operations. Matches +// int_nvvm_atomic_{op}_{space}_{type}_{scope} +// and converts it into the appropriate instruction. +// NOTE: not all possible combinations are implemented +// 'space' is limited to generic as it's the only one needed to support CUDA. +// 'scope' = 'gpu' is default and is handled by regular atomic instructions. +class ATOM23_impl<string AsmStr, NVPTXRegClass regclass, list<Predicate> Preds, + dag ins, dag Operands> + : NVPTXInst<(outs regclass:$result), ins, + AsmStr, + [(set regclass:$result, Operands)]>, + Requires<Preds>; + +// Define instruction variants for all addressing modes. +multiclass ATOM2P_impl<string AsmStr, Intrinsic Intr, + NVPTXRegClass regclass, Operand ImmType, + SDNode Imm, ValueType ImmTy, + list<Predicate> Preds> { + let AddedComplexity = 1 in { + def : ATOM23_impl<AsmStr, regclass, Preds, + (ins Int32Regs:$src, regclass:$b), + (Intr Int32Regs:$src, regclass:$b)>; + def : ATOM23_impl<AsmStr, regclass, Preds, + (ins Int64Regs:$src, regclass:$b), + (Intr Int64Regs:$src, regclass:$b)>; + } + // tablegen can't infer argument types from Intrinsic (though it can + // from Instruction) so we have to enforce specific type on + // immediates via explicit cast to ImmTy. 
+ def : ATOM23_impl<AsmStr, regclass, Preds, + (ins Int32Regs:$src, ImmType:$b), + (Intr Int32Regs:$src, (ImmTy Imm:$b))>; + def : ATOM23_impl<AsmStr, regclass, Preds, + (ins Int64Regs:$src, ImmType:$b), + (Intr Int64Regs:$src, (ImmTy Imm:$b))>; +} + +multiclass ATOM3P_impl<string AsmStr, Intrinsic Intr, + NVPTXRegClass regclass, Operand ImmType, + SDNode Imm, ValueType ImmTy, + list<Predicate> Preds> { + // Variants for register/immediate permutations of $b and $c + let AddedComplexity = 2 in { + def : ATOM23_impl<AsmStr, regclass, Preds, + (ins Int32Regs:$src, regclass:$b, regclass:$c), + (Intr Int32Regs:$src, regclass:$b, regclass:$c)>; + def : ATOM23_impl<AsmStr, regclass, Preds, + (ins Int64Regs:$src, regclass:$b, regclass:$c), + (Intr Int64Regs:$src, regclass:$b, regclass:$c)>; + } + let AddedComplexity = 1 in { + def : ATOM23_impl<AsmStr, regclass, Preds, + (ins Int32Regs:$src, ImmType:$b, regclass:$c), + (Intr Int32Regs:$src, (ImmTy Imm:$b), regclass:$c)>; + def : ATOM23_impl<AsmStr, regclass, Preds, + (ins Int64Regs:$src, ImmType:$b, regclass:$c), + (Intr Int64Regs:$src, (ImmTy Imm:$b), regclass:$c)>; + def : ATOM23_impl<AsmStr, regclass, Preds, + (ins Int32Regs:$src, regclass:$b, ImmType:$c), + (Intr Int32Regs:$src, regclass:$b, (ImmTy Imm:$c))>; + def : ATOM23_impl<AsmStr, regclass, Preds, + (ins Int64Regs:$src, regclass:$b, ImmType:$c), + (Intr Int64Regs:$src, regclass:$b, (ImmTy Imm:$c))>; + } + def : ATOM23_impl<AsmStr, regclass, Preds, + (ins Int32Regs:$src, ImmType:$b, ImmType:$c), + (Intr Int32Regs:$src, (ImmTy Imm:$b), (ImmTy Imm:$c))>; + def : ATOM23_impl<AsmStr, regclass, Preds, + (ins Int64Regs:$src, ImmType:$b, ImmType:$c), + (Intr Int64Regs:$src, (ImmTy Imm:$b), (ImmTy Imm:$c))>; +} + +// Constructs intrinsic name and instruction asm strings. 
+multiclass ATOM2N_impl<string OpStr, string IntTypeStr, string TypeStr, + string ScopeStr, string SpaceStr, + NVPTXRegClass regclass, Operand ImmType, SDNode Imm, + ValueType ImmTy, list<Predicate> Preds> { + defm : ATOM2P_impl<"atom" # !if(!eq(SpaceStr, "gen"), "", "." # SpaceStr) + # !if(!eq(ScopeStr, "gpu"), "", "." # ScopeStr) + # "." # OpStr # "." # TypeStr + # " \t$result, [$src], $b;", + !cast<Intrinsic>( + "int_nvvm_atomic_" # OpStr + # "_" # SpaceStr # "_" # IntTypeStr + # !if(!eq(ScopeStr,""), "", "_" # ScopeStr)), + regclass, ImmType, Imm, ImmTy, Preds>; +} +multiclass ATOM3N_impl<string OpStr, string IntTypeStr, string TypeStr, + string ScopeStr, string SpaceStr, + NVPTXRegClass regclass, Operand ImmType, SDNode Imm, + ValueType ImmTy, list<Predicate> Preds> { + defm : ATOM3P_impl<"atom" # !if(!eq(SpaceStr, "gen"), "", "." # SpaceStr) + # !if(!eq(ScopeStr, "gpu"), "", "." # ScopeStr) + # "." # OpStr # "." # TypeStr + # " \t$result, [$src], $b, $c;", + !cast<Intrinsic>( + "int_nvvm_atomic_" # OpStr + # "_" # SpaceStr # "_" # IntTypeStr + # !if(!eq(ScopeStr,""), "", "_" # ScopeStr)), + regclass, ImmType, Imm, ImmTy, Preds>; +} + +// Constructs variants for different address spaces. +// For now we only need variants for generic space pointers. +multiclass ATOM2A_impl<string OpStr, string IntTypeStr, string TypeStr, + string ScopeStr, NVPTXRegClass regclass, Operand ImmType, + SDNode Imm, ValueType ImmTy, list<Predicate> Preds> { + defm _gen_ : ATOM2N_impl<OpStr, IntTypeStr, TypeStr, ScopeStr, "gen", + regclass, ImmType, Imm, ImmTy, Preds>; +} +multiclass ATOM3A_impl<string OpStr, string IntTypeStr, string TypeStr, + string ScopeStr, NVPTXRegClass regclass, Operand ImmType, + SDNode Imm, ValueType ImmTy, list<Predicate> Preds> { + defm _gen_ : ATOM3N_impl<OpStr, IntTypeStr, TypeStr, ScopeStr, "gen", + regclass, ImmType, Imm, ImmTy, Preds>; +} + +// Constructs variants for different scopes of atomic op. 
+multiclass ATOM2S_impl<string OpStr, string IntTypeStr, string TypeStr, + NVPTXRegClass regclass, Operand ImmType, SDNode Imm, + ValueType ImmTy, list<Predicate> Preds> { + // .gpu scope is default and is currently covered by existing + // atomics w/o explicitly specified scope. + defm _cta : ATOM2A_impl<OpStr, IntTypeStr, TypeStr, "cta", + regclass, ImmType, Imm, ImmTy, + !listconcat(Preds,[hasAtomScope])>; + defm _sys : ATOM2A_impl<OpStr, IntTypeStr, TypeStr, "sys", + regclass, ImmType, Imm, ImmTy, + !listconcat(Preds,[hasAtomScope])>; +} +multiclass ATOM3S_impl<string OpStr, string IntTypeStr, string TypeStr, + NVPTXRegClass regclass, Operand ImmType, SDNode Imm, ValueType ImmTy, + list<Predicate> Preds> { + // No need to define ".gpu"-scoped atomics. They do the same thing + // as the regular, non-scoped atomics defined elsewhere. + defm _cta : ATOM3A_impl<OpStr, IntTypeStr, TypeStr, "cta", + regclass, ImmType, Imm, ImmTy, + !listconcat(Preds,[hasAtomScope])>; + defm _sys : ATOM3A_impl<OpStr, IntTypeStr, TypeStr, "sys", + regclass, ImmType, Imm, ImmTy, + !listconcat(Preds,[hasAtomScope])>; +} +// atom.add +multiclass ATOM2_add_impl<string OpStr> { + defm _s32 : ATOM2S_impl<OpStr, "i", "s32", Int32Regs, i32imm, imm, i32, []>; + defm _u32 : ATOM2S_impl<OpStr, "i", "u32", Int32Regs, i32imm, imm, i32, []>; + defm _u64 : ATOM2S_impl<OpStr, "i", "u64", Int64Regs, i64imm, imm, i64, []>; + defm _f32 : ATOM2S_impl<OpStr, "f", "f32", Float32Regs, f32imm, fpimm, f32, + [hasAtomAddF32]>; + defm _f64 : ATOM2S_impl<OpStr, "f", "f64", Float64Regs, f64imm, fpimm, f64, + [hasAtomAddF64]>; +} + +// atom.{and,or,xor} +multiclass ATOM2_bitwise_impl<string OpStr> { + defm _b32 : ATOM2S_impl<OpStr, "i", "b32", Int32Regs, i32imm, imm, i32, []>; + defm _b64 : ATOM2S_impl<OpStr, "i", "b64", Int64Regs, i64imm, imm, i64, + [hasAtomBitwise64]>; +} + +// atom.exch +multiclass ATOM2_exch_impl<string OpStr> { + defm _b32 : ATOM2S_impl<OpStr, "i", "b32", Int32Regs, i32imm, imm, i32, []>; + 
defm _b64 : ATOM2S_impl<OpStr, "i", "b64", Int64Regs, i64imm, imm, i64, []>; +} + +// atom.{min,max} +multiclass ATOM2_minmax_impl<string OpStr> { + defm _s32 : ATOM2S_impl<OpStr, "i", "s32", Int32Regs, i32imm, imm, i32, []>; + defm _u32 : ATOM2S_impl<OpStr, "i", "u32", Int32Regs, i32imm, imm, i32, []>; + defm _s64 : ATOM2S_impl<OpStr, "i", "s64", Int64Regs, i64imm, imm, i64, + [hasAtomMinMax64]>; + defm _u64 : ATOM2S_impl<OpStr, "i", "u64", Int64Regs, i64imm, imm, i64, + [hasAtomMinMax64]>; +} + +// atom.{inc,dec} +multiclass ATOM2_incdec_impl<string OpStr> { + defm _u32 : ATOM2S_impl<OpStr, "i", "u32", Int32Regs, i32imm, imm, i32, []>; +} + +// atom.cas +multiclass ATOM3_cas_impl<string OpStr> { + defm _b32 : ATOM3S_impl<OpStr, "i", "b32", Int32Regs, i32imm, imm, i32, []>; + defm _b64 : ATOM3S_impl<OpStr, "i", "b64", Int64Regs, i64imm, imm, i64, []>; +} +defm INT_PTX_SATOM_ADD : ATOM2_add_impl<"add">; +defm INT_PTX_SATOM_AND : ATOM2_bitwise_impl<"and">; +defm INT_PTX_SATOM_CAS : ATOM3_cas_impl<"cas">; +defm INT_PTX_SATOM_DEC : ATOM2_incdec_impl<"dec">; +defm INT_PTX_SATOM_EXCH: ATOM2_exch_impl<"exch">; +defm INT_PTX_SATOM_INC : ATOM2_incdec_impl<"inc">; +defm INT_PTX_SATOM_MAX : ATOM2_minmax_impl<"max">; +defm INT_PTX_SATOM_MIN : ATOM2_minmax_impl<"min">; +defm INT_PTX_SATOM_OR : ATOM2_bitwise_impl<"or">; +defm INT_PTX_SATOM_XOR : ATOM2_bitwise_impl<"xor">; //----------------------------------- // Support for ldu on sm_20 or later diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp index bd2509a3c8c..6e1f427ed02 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp @@ -29,8 +29,6 @@ void NVPTXSubtarget::anchor() {} NVPTXSubtarget &NVPTXSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) { // Provide the default CPU if we don't have one. - if (CPU.empty() && FS.size()) - llvm_unreachable("we are not using FeatureStr"); TargetName = CPU.empty() ? 
"sm_20" : CPU; ParseSubtargetFeatures(TargetName, FS); diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h index 41670390c41..da020a94bcd 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h @@ -48,6 +48,10 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo { // FrameLowering class because TargetFrameLowering is abstract. NVPTXFrameLowering FrameLowering; +protected: + // Processor supports scoped atomic operations. + bool HasAtomScope; + public: /// This constructor initializes the data members to match that /// of the specified module. @@ -77,6 +81,10 @@ public: bool hasAtomRedGen32() const { return SmVersion >= 20; } bool hasAtomRedGen64() const { return SmVersion >= 20; } bool hasAtomAddF32() const { return SmVersion >= 20; } + bool hasAtomAddF64() const { return SmVersion >= 60; } + bool hasAtomScope() const { return HasAtomScope; } + bool hasAtomBitwise64() const { return SmVersion >= 32; } + bool hasAtomMinMax64() const { return SmVersion >= 32; } bool hasVote() const { return SmVersion >= 12; } bool hasDouble() const { return SmVersion >= 13; } bool reqPTX20() const { return SmVersion >= 20; } diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index 580d345cc66..48928ee2d54 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -42,6 +42,29 @@ static bool isNVVMAtomic(const IntrinsicInst *II) { case Intrinsic::nvvm_atomic_load_add_f32: case Intrinsic::nvvm_atomic_load_inc_32: case Intrinsic::nvvm_atomic_load_dec_32: + + case Intrinsic::nvvm_atomic_add_gen_f_cta: + case Intrinsic::nvvm_atomic_add_gen_f_sys: + case Intrinsic::nvvm_atomic_add_gen_i_cta: + case Intrinsic::nvvm_atomic_add_gen_i_sys: + case Intrinsic::nvvm_atomic_and_gen_i_cta: + case Intrinsic::nvvm_atomic_and_gen_i_sys: + case Intrinsic::nvvm_atomic_cas_gen_i_cta: + case 
Intrinsic::nvvm_atomic_cas_gen_i_sys: + case Intrinsic::nvvm_atomic_dec_gen_i_cta: + case Intrinsic::nvvm_atomic_dec_gen_i_sys: + case Intrinsic::nvvm_atomic_inc_gen_i_cta: + case Intrinsic::nvvm_atomic_inc_gen_i_sys: + case Intrinsic::nvvm_atomic_max_gen_i_cta: + case Intrinsic::nvvm_atomic_max_gen_i_sys: + case Intrinsic::nvvm_atomic_min_gen_i_cta: + case Intrinsic::nvvm_atomic_min_gen_i_sys: + case Intrinsic::nvvm_atomic_or_gen_i_cta: + case Intrinsic::nvvm_atomic_or_gen_i_sys: + case Intrinsic::nvvm_atomic_exch_gen_i_cta: + case Intrinsic::nvvm_atomic_exch_gen_i_sys: + case Intrinsic::nvvm_atomic_xor_gen_i_cta: + case Intrinsic::nvvm_atomic_xor_gen_i_sys: return true; } } |